DISTRIBUTED DATA ANALYSIS AND MINING - PROJECT¶

By:

  • Bruno Limón
  • Rebecca Saitta
  • Charlotte Brimont
  • Giovanni Battista D’Orsi

IMPORTS¶

In [ ]:
# For Colab
# Install packages:
# !pip install pyspark
# !pip install sparknlp

# Or, setup pyspark environment
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash
--2022-12-16 19:12:47--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2022-12-16 19:12:47--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-12-16 19:12:48--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’

Installing PySpark 3.2.1 and Spark NLP 4.2.5
-                   100%[===================>]   1.16K  --.-KB/s    in 0s      

2022-12-16 19:12:49 (44.7 MB/s) - written to stdout [1191/1191]

setup Colab for PySpark 3.2.1 and Spark NLP 4.2.5
     |████████████████████████████████| 281.4 MB 36 kB/s 
     |████████████████████████████████| 453 kB 78.3 MB/s 
     |████████████████████████████████| 198 kB 62.5 MB/s 
  Building wheel for pyspark (setup.py) ... done
In [ ]:
# General
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
np.set_printoptions(suppress=True) # to avoid scientific notation while printing

from wordcloud import WordCloud, STOPWORDS
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, precision_recall_curve, average_precision_score, roc_auc_score

# Spark
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import FloatType

# Preprocessing
from pyspark.sql.functions import *
from pyspark.ml.stat import Correlation
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import CountVectorizer, VectorAssembler, StringIndexer, RegexTokenizer, StopWordsRemover, OneHotEncoder

# Clustering
from pyspark.ml.clustering import LDA, KMeans, BisectingKMeans
from pyspark.ml.feature import PCA as PCAml, HashingTF, IDF

# Classification
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, LinearSVC, LinearSVCSummary

# Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

# Tuning
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Frequent Pattern Mining
from pyspark.ml.fpm import FPGrowth
In [ ]:
# For Colab, mount drive, making sure to have the files as a shortcut in your main drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

1.0 DATA UNDERSTANDING & PREPROCESSING

In [ ]:
# Initialize a new Spark Session
sc = SparkContext(appName="DDAM_Group6_context", master="local[*]")

spark = SparkSession.builder \
     .master('local[*]') \
     .appName('DDAM_Group6_session') \
     .getOrCreate()

sqlContext = SQLContext(spark)
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:77: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
In [ ]:
# Choose between colab or local path
path = '/content/drive/MyDrive/' # Colab
# path = 'Data/'                 # local

1.1 FAKE DATASET¶

In [ ]:
# loading the fake dataset
fake_data = spark.read.format("csv") .options(header='true', 
             multiLine = 'True',
             inferschema='true', 
             treatEmptyValuesAsNulls='true',
             escape='\"').load(path + 'Fake.csv')

print(fake_data.printSchema())
print(fake_data.count())
fake_data.show(5)
root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)

None
23471
+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
+--------------------+--------------------+-------+-----------------+
only showing top 5 rows

In [ ]:
#see if we have only unique titles and texts
print('nb unique title:', fake_data.select('title').distinct().count())
print('nb unique text:', fake_data.select('text').distinct().count())
print('nb rows:', fake_data.count())
nb unique title: 17897
nb unique text: 17449
nb rows: 23471
In [ ]:
# Looking at the distinct values in 'subject' from fake news
fake_data.select('subject').distinct().collect()
Out[ ]:
[Row(subject='US_News'),
 Row(subject='left-news'),
 Row(subject='politics'),
 Row(subject='Government News'),
 Row(subject='Middle-east'),
 Row(subject='News')]
In [ ]:
# Collecting contents of dataset to see a full example of the text column
print(fake_data.collect()[0].__getitem__('text'))
Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t even allow him to rise above the gutter long enough to wish the American citizens a happy new year!  Bishop Talbert Swan (@TalbertSwan) December 31, 2017no one likes you  Calvin (@calvinstowell) December 31, 2017Your impeachment would make 2018 a great year for America, but I ll also accept regaining control of Congress.  Miranda Yaver (@mirandayaver) December 31, 2017Do you hear yourself talk? When you have to include that many people that hate you you have to wonder? Why do the they all hate me?  Alan Sandoval (@AlanSandoval13) December 31, 2017Who uses the word Haters in a New Years wish??  Marlene (@marlene399) December 31, 2017You can t just say happy new year?  Koren pollitt (@Korencarpenter) December 31, 2017Here s Trump s New Year s Eve tweet from 2016.Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love!  Donald J. Trump (@realDonaldTrump) December 31, 2016This is nothing new for Trump. He s been doing this for years.Trump has directed messages to his  enemies  and  haters  for New Year s, Easter, Thanksgiving, and the anniversary of 9/11. pic.twitter.com/4FPAe2KypA  Daniel Dale (@ddale8) December 31, 2017Trump s holiday tweets are clearly not presidential.How long did he work at Hallmark before becoming President?  Steven Goodine (@SGoodine) December 31, 2017He s always been like this . . . the only difference is that in the last few years, his filter has been breaking down.  Roy Schulze (@thbthttt) December 31, 2017Who, apart from a teenager uses the term haters?  Wendy (@WendyWhistles) December 31, 2017he s a fucking 5 year old  Who Knows (@rainyday80) December 31, 2017So, to all the people who voted for this a hole thinking he would change once he got into power, you were wrong! 70-year-old men don t change and now he s a year older.Photo by Andrew Burton/Getty Images.

1.1 TRUE DATASET¶

In [ ]:
# loading the true dataset
true_data = spark.read.format("csv") .options(header='true', 
             multiLine = 'True',
             inferschema='true', 
             treatEmptyValuesAsNulls='true',
             escape='\"').load(path + 'True.csv')

print(true_data.printSchema())
print(true_data.count())
true_data.show(5)
root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)

None
21417
+--------------------+--------------------+------------+------------------+
|               title|                text|     subject|              date|
+--------------------+--------------------+------------+------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |
+--------------------+--------------------+------------+------------------+
only showing top 5 rows

In [ ]:
#see if we have only unique titles and texts
print('nb unique title:', true_data.select('title').distinct().count())
print('nb unique text:', true_data.select('text').distinct().count())
print('nb rows:', true_data.count())
nb unique title: 20826
nb unique text: 21192
nb rows: 21417
In [ ]:
# Looking at the distinct values in 'subject'
print(true_data.select('subject').distinct().collect())
[Row(subject='worldnews'), Row(subject='politicsNews')]
In [ ]:
# Collecting contents of dataset to see an example of text column
print(true_data.collect()[0].__getitem__('title'))
As U.S. budget fight looms, Republicans flip their fiscal script

1.3 FULL DATASET¶

In [ ]:
# Adding a 'fake' column to specify the label of the record when combining both datasets
fake_data = fake_data.withColumn("fake", lit(1))
true_data = true_data.withColumn("fake", lit(0))

# Putting both datasets together and looking at an overview of them
full_data = fake_data.union(true_data)
print(full_data.printSchema())
print(full_data.count())
full_data.show(5)
root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- fake: integer (nullable = false)

None
44888
+--------------------+--------------------+-------+-----------------+----+
|               title|                text|subject|             date|fake|
+--------------------+--------------------+-------+-----------------+----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|   1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|   1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|   1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|   1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|   1|
+--------------------+--------------------+-------+-----------------+----+
only showing top 5 rows

In [ ]:
# Looking at the distinct values in 'subject' for the full dataset
full_data.select('subject').distinct().collect()
Out[ ]:
[Row(subject='US_News'),
 Row(subject='left-news'),
 Row(subject='politics'),
 Row(subject='Government News'),
 Row(subject='Middle-east'),
 Row(subject='News'),
 Row(subject='worldnews'),
 Row(subject='politicsNews')]
In [ ]:
# Brief overview of data
full_data.describe().show()
+-------+--------------------+--------------------+---------------+------------------+-------------------+
|summary|               title|                text|        subject|              date|               fake|
+-------+--------------------+--------------------+---------------+------------------+-------------------+
|  count|               44888|               44888|          44888|             44888|              44888|
|   mean|                null|                null|           null|              null| 0.5228791659240777|
| stddev|                null|                null|           null|              null|0.49948183314938155|
|    min|\r\r\r\r\r\r\nDon...|                    |Government News|         14-Feb-18|                  0|
|    max|“You’re Not Welco...|youngers these da...|      worldnews|September 9, 2017 |                  1|
+-------+--------------------+--------------------+---------------+------------------+-------------------+

In [ ]:
# Looking for Nulls and NaNs
full_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in full_data.columns]).show()
+-----+----+-------+----+----+
|title|text|subject|date|fake|
+-----+----+-------+----+----+
|    0|   0|      0|   0|   0|
+-----+----+-------+----+----+

In [ ]:
# Creating view to query from
full_data.createOrReplaceTempView("view")

# Query to get total number of elements for each category
total_subject = sqlContext.sql("""SELECT subject, COUNT(*) as total 
                                    FROM view 
                                    GROUP BY subject 
                                    ORDER BY total desc""")

# Query to get total number of elements for each label
total_label = sqlContext.sql("""SELECT fake, COUNT(*) as total 
                                    FROM view 
                                    GROUP BY fake 
                                    ORDER BY total desc""")

# Query to get total number of elements for each subject, divided by label
total_subject_label = sqlContext.sql("""SELECT subject, COUNT(*) as total, sum(fake) as fake, COUNT(*) - sum(fake) as true
                                    FROM view 
                                    GROUP BY subject, fake 
                                    ORDER BY total desc""")

print(total_subject.show())
print(total_label.show())
print(total_subject_label.show())

# Passing spark DF to pandas DF to plot more easily later
total_subject = total_subject.toPandas()
total_label = total_label.toPandas()
+---------------+-----+
|        subject|total|
+---------------+-----+
|   politicsNews|11272|
|      worldnews|10145|
|           News| 9050|
|       politics| 6836|
|      left-news| 4456|
|Government News| 1568|
|        US_News|  783|
|    Middle-east|  778|
+---------------+-----+

None
+----+-----+
|fake|total|
+----+-----+
|   1|23471|
|   0|21417|
+----+-----+

None
+---------------+-----+----+-----+
|        subject|total|fake| true|
+---------------+-----+----+-----+
|   politicsNews|11272|   0|11272|
|      worldnews|10145|   0|10145|
|           News| 9050|9050|    0|
|       politics| 6836|6836|    0|
|      left-news| 4456|4456|    0|
|Government News| 1568|1568|    0|
|        US_News|  783| 783|    0|
|    Middle-east|  778| 778|    0|
+---------------+-----+----+-----+

None
In [ ]:
# News by subject
plt.bar(total_subject['subject'], total_subject['total'], color='darkslateblue') 
plt.title("Amount of news by subject")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('SubjectBefore.png', dpi=600) 
plt.show()
In [ ]:
# News by label
plt.bar(['Fake','Real'], total_label['total']) 
plt.title("Total news by label")
plt.show()

1.4 PREPROCESSING¶

1.4.1 SUBJECT & DATE

SUBJECT

Since "worldnews" and "politicsNews" are subjects that appear only on the "True" dataset, in order to avoid bias we decided to transorm them into subjects already appearing in the "Fake" dataset, transforming "worldnews" into "news" and "politicsnews" into "politics", expecting them to match just fine

In [ ]:
# Using regex_replace to transform the subjects
full_data_pre = full_data.withColumn("subject",regexp_replace(col('subject'), 'worldnews', 'News')) \
               .withColumn("subject",regexp_replace(col('subject'), 'politicsNews', 'politics'))
full_data_pre.show(5)
+--------------------+--------------------+-------+-----------------+----+
|               title|                text|subject|             date|fake|
+--------------------+--------------------+-------+-----------------+----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|   1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|   1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|   1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|   1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|   1|
+--------------------+--------------------+-------+-----------------+----+
only showing top 5 rows

In [ ]:
# Confirming the undesired subjects are no longer present in the dataset
full_data_pre.select('subject').distinct().collect()
Out[ ]:
[Row(subject='US_News'),
 Row(subject='left-news'),
 Row(subject='politics'),
 Row(subject='Government News'),
 Row(subject='Middle-east'),
 Row(subject='News')]
In [ ]:
# Querying again for the total amount of records with each subject, we can now see the difference in "News" and "Politics"
full_data_pre.createOrReplaceTempView("view")
total_subject = sqlContext.sql("""SELECT subject, COUNT(*) as total 
                                    FROM view 
                                    GROUP BY subject 
                                    ORDER BY total desc""")
print(total_subject.show())
total_subject = total_subject.toPandas()
+---------------+-----+
|        subject|total|
+---------------+-----+
|           News|19195|
|       politics|18108|
|      left-news| 4456|
|Government News| 1568|
|        US_News|  783|
|    Middle-east|  778|
+---------------+-----+

None
In [ ]:
# News by subject
plt.bar(total_subject['subject'], total_subject['total']) 
plt.title("Amount of news by subject")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('SubjectAfter.png', dpi=600)
plt.show()

DATE

Regarding "Date", the idea is to have a consistent and accessible format to use for analysis. As of now, there are seem to be different formats present in the dataset, like full months and abbreviated ones, which can complicate their use.

In [ ]:
# 1st step is to remove the comma from the date
data_date = full_data_pre.withColumn("date",regexp_replace(col('date'), ',', ''))

# Then splitting each element into its respective column
data_date = data_date.withColumn('year', split(data_date['date'], ' ').getItem(2)) \
                     .withColumn('month', split(data_date['date'], ' ').getItem(0)) \
                     .withColumn('day', split(data_date['date'], ' ').getItem(1))
print(data_date.show(1))

# Tranforming abbreviated months, e.g. "Dec" into their full form, "December", then taking this full form into a number, "12"
data_date = data_date.withColumn('month', when((length(data_date.month) > 3) | (data_date.month == "May"), data_date.month)
                     .otherwise(from_unixtime(unix_timestamp(data_date.month,'MMM'),'MMMM'))) \
                     .withColumn("month",from_unixtime(unix_timestamp(col("month"),'MMMM'),'MM'))
print(data_date.show(1))
+--------------------+--------------------+-------+----------------+----+----+--------+---+
|               title|                text|subject|            date|fake|year|   month|day|
+--------------------+--------------------+-------+----------------+----+----+--------+---+
| Donald Trump Sen...|Donald Trump just...|   News|December 31 2017|   1|2017|December| 31|
+--------------------+--------------------+-------+----------------+----+----+--------+---+
only showing top 1 row

None
+--------------------+--------------------+-------+----------------+----+----+-----+---+
|               title|                text|subject|            date|fake|year|month|day|
+--------------------+--------------------+-------+----------------+----+----+-----+---+
| Donald Trump Sen...|Donald Trump just...|   News|December 31 2017|   1|2017|   12| 31|
+--------------------+--------------------+-------+----------------+----+----+-----+---+
only showing top 1 row

None
In [ ]:
# Looking for Nulls and NaNs before producing the final output for date, to see if anything went wrong
data_date.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data_date.columns]).show()
data_date.filter(col("month").isNull()).show(5)
+-----+----+-------+----+----+----+-----+---+
|title|text|subject|date|fake|year|month|day|
+-----+----+-------+----+----+----+-----+---+
|    0|   0|      0|   0|   0|  35|   35| 35|
+-----+----+-------+----+----+----+-----+---+

+--------------------+--------------------+--------+---------+----+----+-----+----+
|               title|                text| subject|     date|fake|year|month| day|
+--------------------+--------------------+--------+---------+----+----+-----+----+
|Democrat Senator ...|According to The ...|politics|19-Feb-18|   1|null| null|null|
|MSNBC ANCHOR Flab...|If we protect eve...|politics|19-Feb-18|   1|null| null|null|
|WATCH: SNOWFLAKES...|Ami Horowitz is f...|politics|19-Feb-18|   1|null| null|null|
|JUST IN: BADASS G...|Just one more rem...|politics|18-Feb-18|   1|null| null|null|
|DOJ’s JEFF SESSIO...|Thank goodnesss J...|politics|18-Feb-18|   1|null| null|null|
+--------------------+--------------------+--------+---------+----+----+-----+----+
only showing top 5 rows

In [ ]:
# Indeed, the previous cell shows us that 35 rows got NULL values, this due to a 3rd and unexpected format, a numerical one
# Given the small amount of them and for simplicity, dropping the rows is the chosen approach
data_date = data_date.dropna()
data_date.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data_date.columns]).show()
+-----+----+-------+----+----+----+-----+---+
|title|text|subject|date|fake|year|month|day|
+-----+----+-------+----+----+----+-----+---+
|    0|   0|      0|   0|   0|   0|    0|  0|
+-----+----+-------+----+----+----+-----+---+

In [ ]:
# Finalizing the date preprocessing by putting together the values into their final column
data_date = data_date.select('title','text','subject', 
                             concat_ws('-',data_date.year,data_date.month,data_date.day).alias("date"),
                             'fake')
print(data_date.show(5))
+--------------------+--------------------+-------+----------+----+
|               title|                text|subject|      date|fake|
+--------------------+--------------------+-------+----------+----+
| Donald Trump Sen...|Donald Trump just...|   News|2017-12-31|   1|
| Drunk Bragging T...|House Intelligenc...|   News|2017-12-31|   1|
| Sheriff David Cl...|On Friday, it was...|   News|2017-12-30|   1|
| Trump Is So Obse...|On Christmas day,...|   News|2017-12-29|   1|
| Pope Francis Jus...|Pope Francis used...|   News|2017-12-25|   1|
+--------------------+--------------------+-------+----------+----+
only showing top 5 rows

None

1.4.2 PREPARING FOR NLP¶

In [ ]:
# Removing numbers and symbols to remain only with words
data_nlp = data_date

for col_name in ["title", "text"]:
    data_nlp = data_nlp.withColumn(col_name,regexp_replace(col(col_name), '\d+', ''))

data_nlp.select('text').show(1, truncate=False)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.   will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year.  will be a great year for America!  Donald J. Trump (@realDonaldTrump) December , Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t even allow him to rise above the gutter long enough to wish the American citizens a happy new year!  Bishop Talbert Swan (@TalbertSwan) December , no one likes you  Calvin (@calvinstowell) December , Your impeachment would make  a great year for America, but I ll also accept regaining control of Congress.  Miranda Yaver (@mirandayaver) December , Do you hear yourself talk? When you have to include that many people that hate you you have to wonder? Why do the they all hate me?  Alan Sandoval (@AlanSandoval) December , Who uses the word Haters in a New Years wish??  Marlene (@marlene) December , You can t just say happy new year?  Koren pollitt (@Korencarpenter) December , Here s Trump s New Year s Eve tweet from .Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love!  Donald J. Trump (@realDonaldTrump) December , This is nothing new for Trump. He s been doing this for years.Trump has directed messages to his  enemies  and  haters  for New Year s, Easter, Thanksgiving, and the anniversary of /. pic.twitter.com/FPAeKypA  Daniel Dale (@ddale) December , Trump s holiday tweets are clearly not presidential.How long did he work at Hallmark before becoming President?  Steven Goodine (@SGoodine) December , He s always been like this . . . the only difference is that in the last few years, his filter has been breaking down.  Roy Schulze (@thbthttt) December , Who, apart from a teenager uses the term haters?  Wendy (@WendyWhistles) December , he s a fucking  year old  Who Knows (@rainyday) December , So, to all the people who voted for this a hole thinking he would change once he got into power, you were wrong! -year-old men don t change and now he s a year older.Photo by Andrew Burton/Getty Images.|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row

In [ ]:
# Tokenizer
for col_name in ["title", "text"]:
  regex_tokenizer = RegexTokenizer(inputCol=col_name, outputCol=col_name+"Token", pattern="\\W")
  data_nlp = regex_tokenizer.transform(data_nlp)

# Adding id to keep track of records later on
data_nlp = data_nlp.withColumn("id", monotonically_increasing_id())

data_nlp = data_nlp.select('id', 'titleToken', 'textToken', 'subject', 'date', 'fake')
data_nlp.show(5)
+---+--------------------+--------------------+-------+----------+----+
| id|          titleToken|           textToken|subject|      date|fake|
+---+--------------------+--------------------+-------+----------+----+
|  0|[donald, trump, s...|[donald, trump, j...|   News|2017-12-31|   1|
|  1|[drunk, bragging,...|[house, intellige...|   News|2017-12-31|   1|
|  2|[sheriff, david, ...|[on, friday, it, ...|   News|2017-12-30|   1|
|  3|[trump, is, so, o...|[on, christmas, d...|   News|2017-12-29|   1|
|  4|[pope, francis, j...|[pope, francis, u...|   News|2017-12-25|   1|
+---+--------------------+--------------------+-------+----------+----+
only showing top 5 rows

In [ ]:
# Removing stopwords
for col_name in ["titleToken", "textToken"]:
  remover = StopWordsRemover(inputCol=col_name, outputCol=col_name+"s")
  data_nlp = remover.transform(data_nlp)

data_nlp = data_nlp.select('id', 'titleTokens', 'textTokens', 'subject', 'date', 'fake')
(data_nlp.select('titleTokens').collect()[0][0])[0:10]
Out[ ]:
['donald',
 'trump',
 'sends',
 'embarrassing',
 'new',
 'year',
 'eve',
 'message',
 'disturbing']
In [ ]:
# Turning subject strings into numeric values, to later apply one hot encoding
indexer = StringIndexer(inputCol='subject', outputCol='numericSubject')
indexer_model = indexer.fit(data_nlp)
data_nlp_vec = indexer_model.transform(data_nlp)

# Applying one-hot encoding to turn categorical values in subject into a vector representing the diferent categories
OHE = OneHotEncoder(inputCol="numericSubject", outputCol="subject_vec", dropLast=False)
OHE_model = OHE.fit(data_nlp_vec)
data_nlp_vec = OHE_model.transform(data_nlp_vec).select('id',"titleTokens","textTokens","subject_vec","date","fake")
data_nlp_vec.show(5)
+---+--------------------+--------------------+-------------+----------+----+
| id|         titleTokens|          textTokens|  subject_vec|      date|fake|
+---+--------------------+--------------------+-------------+----------+----+
|  0|[donald, trump, s...|[donald, trump, c...|(6,[0],[1.0])|2017-12-31|   1|
|  1|[drunk, bragging,...|[house, intellige...|(6,[0],[1.0])|2017-12-31|   1|
|  2|[sheriff, david, ...|[friday, revealed...|(6,[0],[1.0])|2017-12-30|   1|
|  3|[trump, obsessed,...|[christmas, day, ...|(6,[0],[1.0])|2017-12-29|   1|
|  4|[pope, francis, c...|[pope, francis, u...|(6,[0],[1.0])|2017-12-25|   1|
+---+--------------------+--------------------+-------------+----------+----+
only showing top 5 rows

In [ ]:
# In order to vectorize the date column, it is first split and put into separate columns
data_nlp_vec = data_nlp_vec.withColumn("date",regexp_replace(col('date'), '-', ' '))
data_nlp_vec = data_nlp_vec.withColumn('year', split(data_nlp_vec['date'], ' ').getItem(0)) \
                     .withColumn('month', split(data_nlp_vec['date'], ' ').getItem(1)) \
                     .withColumn('day', split(data_nlp_vec['date'], ' ').getItem(2))

# Casting as int to feed into VectorAssembler later on to vectorize
data_nlp_vec = data_nlp_vec.withColumn("year",col("year").cast('int')) \
                           .withColumn("month",col("month").cast('int')) \
                           .withColumn("day",col("day").cast('int'))
data_nlp_vec.show(5)
+---+--------------------+--------------------+-------------+----------+----+----+-----+---+
| id|         titleTokens|          textTokens|  subject_vec|      date|fake|year|month|day|
+---+--------------------+--------------------+-------------+----------+----+----+-----+---+
|  0|[donald, trump, s...|[donald, trump, c...|(6,[0],[1.0])|2017 12 31|   1|2017|   12| 31|
|  1|[drunk, bragging,...|[house, intellige...|(6,[0],[1.0])|2017 12 31|   1|2017|   12| 31|
|  2|[sheriff, david, ...|[friday, revealed...|(6,[0],[1.0])|2017 12 30|   1|2017|   12| 30|
|  3|[trump, obsessed,...|[christmas, day, ...|(6,[0],[1.0])|2017 12 29|   1|2017|   12| 29|
|  4|[pope, francis, c...|[pope, francis, u...|(6,[0],[1.0])|2017 12 25|   1|2017|   12| 25|
+---+--------------------+--------------------+-------------+----------+----+----+-----+---+
only showing top 5 rows

In [ ]:
# Constructing the vector from the data obtained before
assembler = VectorAssembler(inputCols=['year','month','day'], outputCol="date_vec")
data_nlp_vec = assembler.transform(data_nlp_vec).select('id','titleTokens','textTokens','subject_vec','date_vec','fake')
data_nlp_vec.show(5)
+---+--------------------+--------------------+-------------+------------------+----+
| id|         titleTokens|          textTokens|  subject_vec|          date_vec|fake|
+---+--------------------+--------------------+-------------+------------------+----+
|  0|[donald, trump, s...|[donald, trump, c...|(6,[0],[1.0])|[2017.0,12.0,31.0]|   1|
|  1|[drunk, bragging,...|[house, intellige...|(6,[0],[1.0])|[2017.0,12.0,31.0]|   1|
|  2|[sheriff, david, ...|[friday, revealed...|(6,[0],[1.0])|[2017.0,12.0,30.0]|   1|
|  3|[trump, obsessed,...|[christmas, day, ...|(6,[0],[1.0])|[2017.0,12.0,29.0]|   1|
|  4|[pope, francis, c...|[pope, francis, u...|(6,[0],[1.0])|[2017.0,12.0,25.0]|   1|
+---+--------------------+--------------------+-------------+------------------+----+
only showing top 5 rows

In [ ]:
# Using CountVectorizer, to transform the title and contents of the news articles into vectors that represent their features
for col_name in ["titleTokens", "textTokens"]:
  count_vectorizer = CountVectorizer(inputCol=col_name, outputCol=col_name+'_vecfull')
  cv_model = count_vectorizer.fit(data_nlp_vec)
  data_nlp_vec = cv_model.transform(data_nlp_vec)

data_nlp_vec_fullfeatures = data_nlp_vec.select('id',"titleTokens_vecfull","textTokens_vecfull","subject_vec","date_vec","fake")
  
for col_name in ["titleTokens", "textTokens"]:
  count_vectorizer = CountVectorizer(inputCol=col_name, outputCol=col_name+'_vec', minDF=.01, maxDF=.9)
  cv_model = count_vectorizer.fit(data_nlp_vec)
  data_nlp_vec = cv_model.transform(data_nlp_vec)

data_nlp_vec = data_nlp_vec.select('id',"titleTokens_vec","textTokens_vec","subject_vec","date_vec","fake")
data_nlp_vec.show(5)
+---+--------------------+--------------------+-------------+------------------+----+
| id|     titleTokens_vec|      textTokens_vec|  subject_vec|          date_vec|fake|
+---+--------------------+--------------------+-------------+------------------+----+
|  0|(88,[0,9,24],[1.0...|(3024,[0,2,4,5,7,...|(6,[0],[1.0])|[2017.0,12.0,31.0]|   1|
|  1|(88,[0,43],[1.0,1...|(3024,[0,1,2,8,10...|(6,[0],[1.0])|[2017.0,12.0,31.0]|   1|
|  2|     (88,[51],[1.0])|(3024,[0,4,5,12,1...|(6,[0],[1.0])|[2017.0,12.0,30.0]|   1|
|  3|(88,[0,4],[1.0,1.0])|(3024,[0,2,4,7,11...|(6,[0],[1.0])|[2017.0,12.0,29.0]|   1|
|  4|(88,[0,24,64],[1....|(3024,[0,1,4,5,12...|(6,[0],[1.0])|[2017.0,12.0,25.0]|   1|
+---+--------------------+--------------------+-------------+------------------+----+
only showing top 5 rows

In [ ]:
# Putting together all previous vectors into a final one
assembler = VectorAssembler(inputCols=['titleTokens_vecfull',
                                       'textTokens_vecfull',
                                       'subject_vec',
                                       'date_vec'], 
                            outputCol="full_features")
data_nlp_vec_fullfeatures = assembler.transform(data_nlp_vec_fullfeatures).select('id','titleTokens_vecfull','textTokens_vecfull','subject_vec','date_vec','full_features','fake')

assembler = VectorAssembler(inputCols=['titleTokens_vec',
                                       'textTokens_vec',
                                       'subject_vec',
                                       'date_vec'], 
                            outputCol="full_features")
data_nlp_vec = assembler.transform(data_nlp_vec).select('id','titleTokens_vec','textTokens_vec','subject_vec','date_vec','full_features','fake')
data_nlp_vec.show(5)
+---+--------------------+--------------------+-------------+------------------+--------------------+----+
| id|     titleTokens_vec|      textTokens_vec|  subject_vec|          date_vec|       full_features|fake|
+---+--------------------+--------------------+-------------+------------------+--------------------+----+
|  0|(88,[0,9,24],[1.0...|(3024,[0,2,4,5,7,...|(6,[0],[1.0])|[2017.0,12.0,31.0]|(3121,[0,9,24,88,...|   1|
|  1|(88,[0,43],[1.0,1...|(3024,[0,1,2,8,10...|(6,[0],[1.0])|[2017.0,12.0,31.0]|(3121,[0,43,88,89...|   1|
|  2|     (88,[51],[1.0])|(3024,[0,4,5,12,1...|(6,[0],[1.0])|[2017.0,12.0,30.0]|(3121,[51,88,92,9...|   1|
|  3|(88,[0,4],[1.0,1.0])|(3024,[0,2,4,7,11...|(6,[0],[1.0])|[2017.0,12.0,29.0]|(3121,[0,4,88,90,...|   1|
|  4|(88,[0,24,64],[1....|(3024,[0,1,4,5,12...|(6,[0],[1.0])|[2017.0,12.0,25.0]|(3121,[0,24,64,88...|   1|
+---+--------------------+--------------------+-------------+------------------+--------------------+----+
only showing top 5 rows

In [ ]:
data_nlp_vec_fullfeatures.show(5)
+---+--------------------+--------------------+-------------+------------------+--------------------+----+
| id| titleTokens_vecfull|  textTokens_vecfull|  subject_vec|          date_vec|       full_features|fake|
+---+--------------------+--------------------+-------------+------------------+--------------------+----+
|  0|(20311,[0,9,24,10...|(114671,[0,2,4,5,...|(6,[0],[1.0])|[2017.0,12.0,31.0]|(134991,[0,9,24,1...|   1|
|  1|(20311,[0,43,278,...|(114671,[0,1,2,8,...|(6,[0],[1.0])|[2017.0,12.0,31.0]|(134991,[0,43,278...|   1|
|  2|(20311,[51,472,81...|(114671,[0,4,5,12...|(6,[0],[1.0])|[2017.0,12.0,30.0]|(134991,[51,472,8...|   1|
|  3|(20311,[0,4,311,4...|(114671,[0,2,4,7,...|(6,[0],[1.0])|[2017.0,12.0,29.0]|(134991,[0,4,311,...|   1|
|  4|(20311,[0,24,64,4...|(114671,[0,1,4,5,...|(6,[0],[1.0])|[2017.0,12.0,25.0]|(134991,[0,24,64,...|   1|
+---+--------------------+--------------------+-------------+------------------+--------------------+----+
only showing top 5 rows

1.5 FURTHER EXPLORING¶

In [ ]:
# Putting together subject and fake variables to calculate correlation matrix
assembler = VectorAssembler(inputCols=['subject_vec',
                                       'date_vec',
                                       'fake'], 
                            outputCol="corr_vector")
data_corr = assembler.transform(data_nlp_vec).select('corr_vector')
data_corr.show(5)
+--------------------+
|         corr_vector|
+--------------------+
|(10,[0,6,7,8,9],[...|
|(10,[0,6,7,8,9],[...|
|(10,[0,6,7,8,9],[...|
|(10,[0,6,7,8,9],[...|
|(10,[0,6,7,8,9],[...|
+--------------------+
only showing top 5 rows

In [ ]:
# Columns related to the vectors used for the correlation
cols = ['News', 'Politics', 'Left-news', 'Gov-News', 'US-News', 'Middle-east', 'Year', 'Month', 'Day', 'Fake'] 

# Computing correlation matrix from the vector previously assembled
matrix = Correlation.corr(data_corr, 'corr_vector', method='spearman').collect()[0][0] 
corr_matrix = matrix.toArray().tolist() 
corr_matrix_df = pd.DataFrame(data=corr_matrix, columns=cols) 

# Plotting correlation matrix
plt.figure(figsize=(16,4))  
sns.heatmap(corr_matrix_df, 
            xticklabels=corr_matrix_df.columns.values,
            yticklabels=corr_matrix_df.columns.values,  
            cmap="winter", 
            annot=True)
plt.title('Correlation betweeen subject, date and target variable')
plt.tight_layout()
plt.savefig('CorrMatrixSubject.png', dpi=600) 
plt.show()
In [ ]:
# Dividing the dataset to separate by true and fake
dates_fake = [data[0] for data in data_nlp.select('date').filter(col("fake") == 1).collect()]
dates_true = [data[0] for data in data_nlp.select('date').filter(col("fake") == 0).collect()]

dates_fake_plot = []
dates_true_plot = []

# Converting strings into date format to plot
for i in range(len(dates_fake)):
  temp = datetime.strptime(dates_fake[i], '%Y-%m-%d')
  dates_fake_plot.append(temp)

for i in range(len(dates_true)):
  temp = datetime.strptime(dates_true[i], '%Y-%m-%d')
  dates_true_plot.append(temp)
In [ ]:
# Plotting distribution of dates
fig, ax = plt.subplots(figsize=(10,3))
n, bins, patches = ax.hist([dates_true_plot, dates_fake_plot], bins=150, stacked=True, color=["tab:blue", "tab:orange"])
plt.legend({'True': "tab:blue", 'Fake': "tab:orange"})
plt.title('Frequency distribution of dates')
fig.autofmt_xdate()
fig.tight_layout()
plt.savefig('PlotDates.png', dpi=600) 
plt.show()
/usr/local/lib/python3.8/dist-packages/numpy/core/fromnumeric.py:3208: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  return asarray(a).size
/usr/local/lib/python3.8/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [data[0] for data in data_nlp.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))
In [ ]:
len(word_frequency[0])
Out[ ]:
114671
In [ ]:
# Word Frequency Full dataset
x = word_frequency[1]
y = word_frequency[0]

fig, ax = plt.subplots()
ax.plot(x, y) 
ax.set_title("Word Frequency distribution in Full Dataset")
ax.set_xticks(x[::8000])
ax.set_xticklabels(x[::8000], rotation=45, ha="right")
plt.tight_layout()
plt.savefig('WordFrequency.png', dpi=600)
plt.show()
In [ ]:
plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
wordcloud.to_file("WordcloudFull.png")
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f7cd5910400>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))
In [ ]:
# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Top 10 frequent words in full dataset")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('10FrequentFull.png', dpi=600) 
plt.show()
In [ ]:
fake_data = data_nlp.select("textTokens").filter(col("fake") == 1)
print(fake_data.count())
fake_data.show(5)
23436
+--------------------+
|          textTokens|
+--------------------+
|[donald, trump, c...|
|[house, intellige...|
|[friday, revealed...|
|[christmas, day, ...|
|[pope, francis, u...|
+--------------------+
only showing top 5 rows

In [ ]:
fake_words = [data[0] for data in fake_data.collect()]
fake_full_words = []
for i in range(len(fake_words)):
    fake_full_words.append(" ".join(fake_words[i]))

fake_wordcloud_words = " ".join(fake_full_words)
fake_wordcloud_words[0:100]
Out[ ]:
'donald trump couldn wish americans happy new year leave instead give shout enemies haters dishonest '
In [ ]:
plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(fake_wordcloud_words)
plt.axis('off')
wordcloud.to_file("WordcloudFake.png")
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f7cd9703fa0>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of fake words
fake_words_list = []
for i in range(len(fake_words)):
    fake_words_list += fake_words[i]

# Creating an RDD with fake words
fake_words_rdd = sc.parallelize(fake_words_list)

# Counting occurences for each word
fake_wordcount = fake_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
fake_word_frequency = list(map(list, zip(*fake_wordcount)))
In [ ]:
# Word Frequency fake dataset
plt.bar(fake_word_frequency[1], fake_word_frequency[0]) 
plt.title("Top 10 frequent words in fake dataset")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('10FrequentFake.png', dpi=600) 
plt.show()
In [ ]:
real_data = data_nlp.select("textTokens").filter(col("fake") == 0)
print(real_data.count())
real_data.show(5)
21417
+--------------------+
|          textTokens|
+--------------------+
|[washington, reut...|
|[washington, reut...|
|[washington, reut...|
|[washington, reut...|
|[seattle, washing...|
+--------------------+
only showing top 5 rows

In [ ]:
real_words = [data[0] for data in real_data.collect()]
real_full_words = []
for i in range(len(real_words)):
    real_full_words.append(" ".join(real_words[i]))

real_wordcloud_words = " ".join(real_full_words)
real_wordcloud_words[0:100]
Out[ ]:
'washington reuters head conservative republican faction u congress voted month huge expansion nation'
In [ ]:
plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(real_wordcloud_words)
plt.axis('off')
wordcloud.to_file("WordcloudTrue.png")
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f7cd590b970>
In [ ]:
# Putting together all lists of real words to form a single list containing occurrences of real words
real_words_list = []
for i in range(len(real_words)):
    real_words_list += real_words[i]

# Creating an RDD with real words
real_words_rdd = sc.parallelize(real_words_list)

# Counting occurences for each word
real_wordcount = real_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
real_word_frequency = list(map(list, zip(*real_wordcount)))
In [ ]:
# Word Frequency real dataset
plt.bar(real_word_frequency[1], real_word_frequency[0]) 
plt.title("Word Frequency in real dataset")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('10FrequentTrue.png', dpi=600)
plt.show()

2.0 TOPIC MODELLING¶

In [ ]:
# Selecting the vectorized text variable with full features
data_cluster = data_nlp_vec_fullfeatures.select('id','textTokens_vecfull')
data_cluster.show(5)
+---+--------------------+
| id|  textTokens_vecfull|
+---+--------------------+
|  0|(114671,[0,2,4,5,...|
|  1|(114671,[0,1,2,8,...|
|  2|(114671,[0,4,5,12...|
|  3|(114671,[0,2,4,7,...|
|  4|(114671,[0,1,4,5,...|
+---+--------------------+
only showing top 5 rows

2.1 K-MEANS¶

In [ ]:
# Computing silhouette score the get optimal number of clusters with elbow method
silhouette_scores=[]
evaluator = ClusteringEvaluator(featuresCol='textTokens_vecfull', \
metricName='silhouette', distanceMeasure='squaredEuclidean')

for K in range(2,11):
  KMeans_=KMeans(featuresCol='textTokens_vecfull', k=K)
  KMeans_fit=KMeans_.fit(data_cluster)
  KMeans_transform=KMeans_fit.transform(data_cluster) 
  evaluation_score=evaluator.evaluate(KMeans_transform)
  silhouette_scores.append(evaluation_score)
In [ ]:
plt.figure(figsize=(10, 3))
plt.plot(range(2,11), silhouette_scores)
plt.title('Silhouette Scores for K-Means')
plt.xlabel('Number of Clusters')
plt.savefig('K-Means_silhouette.png', dpi=600)
In [ ]:
kmeans = KMeans(featuresCol='textTokens_vecfull').setK(4).setSeed(1)
modelk = kmeans.fit(data_cluster)

# Make predictions
predictions = modelk.transform(data_cluster)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator(featuresCol='textTokens_vecfull')

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = modelk.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
Silhouette with squared euclidean distance = 0.2083229366850883
Cluster Centers: 
[0.98766467 1.68581772 0.79220997 ... 0.00003352 0.00003352 0.00003352]
[11.56571506  3.2193644   2.2806356  ...  0.          0.
  0.        ]
[2.30701278 7.76936668 1.86028812 ... 0.         0.         0.        ]
[8.90677966 5.99152542 7.11440678 ... 0.         0.         0.        ]
In [ ]:
# Viewing the label assigned to each record belonging to which cluster
data_cluster_label = modelk.transform(data_cluster)
data_cluster_label.show(5)
+---+--------------------+----------+
| id|  textTokens_vecfull|prediction|
+---+--------------------+----------+
|  0|(114671,[0,2,4,5,...|         1|
|  1|(114671,[0,1,2,8,...|         0|
|  2|(114671,[0,4,5,12...|         0|
|  3|(114671,[0,2,4,7,...|         1|
|  4|(114671,[0,1,4,5,...|         0|
+---+--------------------+----------+
only showing top 5 rows

In [ ]:
data_cluster_label = data_cluster_label.join(data_nlp, 'id')
data_cluster_label = data_cluster_label.select('id', 'textTokens', 'prediction')
data_cluster_label.show(5)
+----+--------------------+----------+
|  id|          textTokens|prediction|
+----+--------------------+----------+
|  26|[ronald, reagan, ...|         0|
|  29|[sen, al, franken...|         1|
| 474|[u, sen, ted, cru...|         0|
| 964|[say, people, are...|         0|
|1677|[seem, democrats,...|         0|
+----+--------------------+----------+
only showing top 5 rows

In [ ]:
# Separating clusters and counting them
dk0 = data_cluster_label.filter(data_cluster_label.prediction == 0)
dk1 = data_cluster_label.filter(data_cluster_label.prediction == 1)
dk2 = data_cluster_label.filter(data_cluster_label.prediction == 2)
dk3 = data_cluster_label.filter(data_cluster_label.prediction == 3)

print('Amount of records in cluster 0:', dk0.count())
print('Amount of records in cluster 1:', dk1.count())
print('Amount of records in cluster 2:', dk2.count())
print('Amount of records in cluster 3:', dk3.count())
Amount of records in cluster 0: 29764
Amount of records in cluster 1: 7416
Amount of records in cluster 2: 7437
Amount of records in cluster 3: 236
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk0.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f774be43cd0>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk1.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f77cc0c64f0>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk2.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f7745493940>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk3.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f77c92d2f70>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()

2.2 BISECTING K-MEANS¶

In [ ]:
silhouette_scores=[]
evaluator = ClusteringEvaluator(featuresCol='textTokens_vecfull', \
metricName='silhouette', distanceMeasure='squaredEuclidean')

for K in range(2,11):
  BKMeans_= BisectingKMeans(featuresCol='textTokens_vecfull', k=K)
  BKMeans_fit=BKMeans_.fit(data_cluster)
  BKMeans_transform=BKMeans_fit.transform(data_cluster) 
  evaluation_score=evaluator.evaluate(BKMeans_transform)
  silhouette_scores.append(evaluation_score)
In [ ]:
plt.figure(figsize=(10, 3))
plt.plot(range(2,11), silhouette_scores)
plt.title('Silhouette Scores for Bisecting K-Means')
plt.xlabel('Number of Clusters')
plt.savefig('BK-Means_silhouette.png', dpi=600)
In [ ]:
bkm = BisectingKMeans(featuresCol='textTokens_vecfull').setK(5).setSeed(1)
modelbk = bkm.fit(data_cluster)

# Make predictions
predictions = modelbk.transform(data_cluster)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator(featuresCol='textTokens_vecfull')

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
print("Cluster Centers: ")
centers = modelbk.clusterCenters()
for center in centers:
    print(center)
Silhouette with squared euclidean distance = -0.031550932092296785
Cluster Centers: 
[0.47538434 1.27929652 0.50583239 ... 0.         0.         0.        ]
[2.71748118 1.137284   1.27106965 ... 0.         0.         0.00010601]
[0.62811555 5.75899468 1.165815   ... 0.00010429 0.00010429 0.        ]
[11.03714383  2.56987788  2.15535957 ...  0.          0.
  0.        ]
[9.24487259 9.41330019 3.60596644 ... 0.         0.         0.        ]
In [ ]:
# Viewing the label assigned to each record belonging to which cluster
data_cluster_label = modelbk.transform(data_cluster)
data_cluster_label.show(5)
+---+--------------------+----------+
| id|  textTokens_vecfull|prediction|
+---+--------------------+----------+
|  0|(114671,[0,2,4,5,...|         3|
|  1|(114671,[0,1,2,8,...|         1|
|  2|(114671,[0,4,5,12...|         1|
|  3|(114671,[0,2,4,7,...|         3|
|  4|(114671,[0,1,4,5,...|         1|
+---+--------------------+----------+
only showing top 5 rows

In [ ]:
data_cluster_label = data_cluster_label.join(data_nlp, 'id')
data_cluster_label = data_cluster_label.select('id', 'textTokens', 'prediction')
data_cluster_label.show(5)
+----+--------------------+----------+
|  id|          textTokens|prediction|
+----+--------------------+----------+
|  26|[ronald, reagan, ...|         1|
|  29|[sen, al, franken...|         1|
| 474|[u, sen, ted, cru...|         2|
| 964|[say, people, are...|         1|
|1677|[seem, democrats,...|         1|
+----+--------------------+----------+
only showing top 5 rows

In [ ]:
# Separating clusters and counting them
dk0 = data_cluster_label.filter(data_cluster_label.prediction == 0)
dk1 = data_cluster_label.filter(data_cluster_label.prediction == 1)
dk2 = data_cluster_label.filter(data_cluster_label.prediction == 2)
dk3 = data_cluster_label.filter(data_cluster_label.prediction == 3)
dk4 = data_cluster_label.filter(data_cluster_label.prediction == 4)

print('Amount of records in cluster 0:', dk0.count())
print('Amount of records in cluster 1:', dk1.count())
print('Amount of records in cluster 2:', dk2.count())
print('Amount of records in cluster 3:', dk3.count())
print('Amount of records in cluster 4:', dk4.count())
Amount of records in cluster 0: 16719
Amount of records in cluster 1: 9431
Amount of records in cluster 2: 9589
Amount of records in cluster 3: 5896
Amount of records in cluster 4: 3218
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk0.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f65b3705c10>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk1.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f659c2bbdf0>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk2.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f659b4b7670>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk3.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f65b39e1370>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()
In [ ]:
# Collecting all tokens from the previous dataset in order to put them into a single string 
# and visualize a wordcloud with the most prominent words
words = [cl_data[0] for cl_data in dk4.select('textTokens').collect()]
full_words = []
for i in range(len(words)):
    full_words.append(" ".join(words[i]))

wordcloud_words = " ".join(full_words)


# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .collect()

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))


plt.figure(figsize=(10,10))
wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          collocations=False,
                          width=1200,
                          height=1000
                         ).generate(wordcloud_words)
plt.axis('off')
plt.imshow(wordcloud)
Out[ ]:
<matplotlib.image.AxesImage at 0x7f659a3fb040>
In [ ]:
# Putting together all lists of fake words to form a single list containing occurrences of words
full_words_list = []
for i in range(len(words)):
    full_words_list += words[i]

# Creating an RDD with words
full_words_rdd = sc.parallelize(full_words_list)

# Counting occurences for each word
wordcount = full_words_rdd.flatMap(lambda x: x.split(' ')) \
                          .map(lambda x: (x,1)) \
                          .reduceByKey(lambda x,y:x+y) \
                          .map(lambda x: (x[1], x[0])) \
                          .sortByKey(False) \
                          .take(10)

# Turning list of tuples previously obtained into two separate lists
word_frequency = list(map(list, zip(*wordcount)))

# Word Frequency Full dataset
plt.bar(word_frequency[1], word_frequency[0]) 
plt.title("Word Frequency in full dataset")
plt.xticks(rotation=45, ha="right")
plt.show()

2.3 LATENT DIRICHLET ALLOCATION¶

In [ ]:
# Selecting the vectorized text variable
data_cluster = data_nlp_vec.select('id','textTokens_vec')
data_cluster.show(5)
+---+--------------------+
| id|      textTokens_vec|
+---+--------------------+
|  0|(3024,[0,2,4,5,7,...|
|  1|(3024,[0,1,2,8,10...|
|  2|(3024,[0,4,5,12,1...|
|  3|(3024,[0,2,4,7,11...|
|  4|(3024,[0,1,4,5,12...|
+---+--------------------+
only showing top 5 rows

In [ ]:
perplexity=[]

for K in tqdm(range(2,6)):
    lda_=LDA(k=K, maxIter=10, featuresCol = 'textTokens_vec')
    lda_fit=lda_.fit(data_cluster) 
    evaluation_score=lda_fit.logPerplexity(data_cluster)
    perplexity.append(str(evaluation_score))
100%|██████████| 4/4 [1:03:22<00:00, 950.71s/it] 
In [ ]:
perplexity
Out[ ]:
['8.638651618782573',
 '8.608240995789902',
 '8.658778034781829',
 '8.67310653133198']
In [ ]:
fig, ax = plt.subplots(1,1, figsize =(10,8))
ax.plot(range(2,6),perplexity)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Perplexity')
Out[ ]:
Text(0, 0.5, 'Perplexity')
In [ ]:
lda = LDA(k=30, maxIter=100, featuresCol = 'textTokens_vec')
modellda = lda.fit(data_cluster)

ll = modellda.logLikelihood(data_cluster)
lp = modellda.logPerplexity(data_cluster)
In [ ]:
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
vocab = cv_model.vocabulary
topicslda = modellda.describeTopics()
t = modellda.describeTopics().collect()
topic_inds = [ind.termIndices for ind in t]

topicsl = []
for topic in topic_inds:
  _topic = []
  for ind in topic:
    _topic.append(vocab[ind]) 
  topicsl.append(_topic)

# Shows the result
transformedlda = modellda.transform(data_cluster).select('id','topicDistribution')
transformedlda.show(5)
The lower bound on the log likelihood of the entire corpus: -55539719.189070895
The upper bound on perplexity: 7.059980031249009
+---+--------------------+
| id|   topicDistribution|
+---+--------------------+
|  0|[1.46681296457461...|
|  1|[2.11550528661767...|
|  2|[1.32675586786966...|
|  3|[1.48170958991239...|
|  4|[1.95925599661544...|
+---+--------------------+
only showing top 5 rows

In [ ]:
for i, topic in enumerate(topicsl, start=1):
  print(f"topic {i}: {topic}")
topic 1: ['said', 'eu', 'britain', 'minister', 'may', 'european', 'prime', 'british', 'london', 'brexit']
topic 2: ['senate', 'bill', 'said', 'republicans', 'house', 'republican', 'health', 'legislation', 'obamacare', 'democrats']
topic 3: ['trump', 'said', 'u', 'president', 'house', 'russia', 'russian', 'white', 'former', 'washington']
topic 4: ['sanders', 'campaign', 'cruz', 'clinton', 'presidential', 'bernie', 'said', 'democratic', 'senator', 'candidate']
topic 5: ['fbi', 'information', 'investigation', 'said', 'department', 'emails', 'email', 'documents', 'clinton', 'evidence']
topic 6: ['u', 'said', 'united', 'israel', 'states', 'trump', 'president', 'mexico', 'jerusalem', 'countries']
topic 7: ['court', 'said', 'u', 'law', 'trump', 'order', 'judge', 'federal', 'states', 'supreme']
topic 8: ['said', 'reuters', 'state', 'government', 'u', 'military', 'syria', 'al', 'islamic', 'saudi']
topic 9: ['workers', 'first', 'work', 'jobs', 'food', 'president', 'labor', 'wage', 'pay', 'housing']
topic 10: ['news', 'media', 'twitter', 'com', 'story', 'fox', 'pic', 'cnn', 'fake', 'facebook']
topic 11: ['pelosi', 'nancy', 'strange', 'leader', 'chuck', 'minority', 'schumer', 'democrats', 'said', 'democrat']
topic 12: ['said', 'north', 'china', 'u', 'korea', 'nuclear', 'united', 'trump', 'states', 'south']
topic 13: ['clinton', 'hillary', 'state', 'foundation', 'bill', 'secretary', 'email', 'department', 'campaign', 'former']
topic 14: ['military', 'd', 'air', 'b', 'force', 'said', 'f', 'c', 'flight', 'plane']
topic 15: ['black', 'people', 'white', 'women', 'america', 'obama', 'american', 'muslim', 'like', 'one']
topic 16: ['police', 'said', 'one', 'year', 'people', 'old', 'school', 'told', 'two', 'man']
topic 17: ['obama', 'president', 'iran', 'deal', 'us', 'world', 'war', 'states', 'united', 'american']
topic 18: ['gun', 'people', 'one', 'right', 'law', 'guns', 'like', 'government', 're', 'via']
topic 19: ['religious', 'drug', 'gay', 'sex', 'marriage', 'church', 'christian', 'drugs', 'religion', 'rights']
topic 20: ['tax', 'said', 'percent', 'u', 'billion', 'year', 'trump', 'new', 'companies', 'plan']
topic 21: ['us', 'wire', 'fbi', 'comey', 'st', 'century', 'cia', 'russia', 'intelligence', 'media']
topic 22: ['ryan', 'house', 'paul', 'speaker', 'republican', 'said', 'abortion', 'rep', 'r', 'congress']
topic 23: ['trump', 'donald', 'president', 'people', 'like', 'one', 'twitter', 'said', 're', 'image']
topic 24: ['said', 'germany', 'refugees', 'party', 'merkel', 'government', 'german', 'europe', 'coalition', 'european']
topic 25: ['said', 'trump', 'tax', 'u', 'reuters', 'percent', 'house', 'new', 'republican', 'president']
topic 26: ['climate', 'change', 'water', 'energy', 'state', 'environmental', 'public', 'global', 'oil', 'coal']
topic 27: ['party', 'election', 'trump', 'said', 'republican', 'vote', 'percent', 'voters', 'presidential', 'president']
topic 28: ['wow', 'touch', 'vote', 'adopted', 'compromise', 'said', 'establish', 'consensus', 'u', 'violations']
topic 29: ['turkey', 'iraq', 'said', 'kurdish', 'independence', 'iraqi', 'government', 'turkish', 'region', 'referendum']
topic 30: ['million', 'said', 'state', 'money', 'city', 'new', 'governor', 'office', 'year', 'funds']

3.0 CLASSIFICATION¶

In [ ]:
def getAP(y_test, y_proba):
  AP = average_precision_score(y_test, y_proba, average='weighted')
  return np.round(AP, 4)

def getAUC(y_test, y_proba):
  AUC = roc_auc_score(y_test, y_proba, average='weighted')
  return np.round(AUC, 4)
In [ ]:
# Putting together a vector for classification
assembler = VectorAssembler(inputCols=['titleTokens_vec',
                                       'subject_vec',
                                       'date_vec'], 
                            outputCol="clf_vector")
data_clf = assembler.transform(data_nlp_vec)
data_clf.show(5)
+--------------------+--------------------+-------------+------------------+--------------------+----+--------------------+
|     titleTokens_vec|      textTokens_vec|  subject_vec|          date_vec|       full_features|fake|          clf_vector|
+--------------------+--------------------+-------------+------------------+--------------------+----+--------------------+
|(88,[0,9,24],[1.0...|(3024,[0,2,4,5,7,...|(6,[0],[1.0])|[2017.0,12.0,31.0]|(3121,[0,9,24,88,...|   1|(97,[0,9,24,88,94...|
|(88,[0,43],[1.0,1...|(3024,[0,1,2,8,10...|(6,[0],[1.0])|[2017.0,12.0,31.0]|(3121,[0,43,88,89...|   1|(97,[0,43,88,94,9...|
|     (88,[51],[1.0])|(3024,[0,4,5,12,1...|(6,[0],[1.0])|[2017.0,12.0,30.0]|(3121,[51,88,92,9...|   1|(97,[51,88,94,95,...|
|(88,[0,4],[1.0,1.0])|(3024,[0,2,4,7,11...|(6,[0],[1.0])|[2017.0,12.0,29.0]|(3121,[0,4,88,90,...|   1|(97,[0,4,88,94,95...|
|(88,[0,24,64],[1....|(3024,[0,1,4,5,12...|(6,[0],[1.0])|[2017.0,12.0,25.0]|(3121,[0,24,64,88...|   1|(97,[0,24,64,88,9...|
+--------------------+--------------------+-------------+------------------+--------------------+----+--------------------+
only showing top 5 rows

In [ ]:
# Splitting training and testing data
(data_train, data_test) = data_clf.select('clf_vector','fake').randomSplit([0.7, 0.3], seed = 42)
print('Number of training records: ', data_train.count())
print('Number of testing records: ', data_test.count())
data_test.show(5)
Number of training records:  31567
Number of testing records:  13286
+--------------------+----+
|          clf_vector|fake|
+--------------------+----+
|(97,[0,1,2,4,12,4...|   1|
|(97,[0,1,2,11,36,...|   1|
|(97,[0,1,2,14,38,...|   1|
|(97,[0,1,2,14,38,...|   1|
|(97,[0,1,2,29,88,...|   1|
+--------------------+----+
only showing top 5 rows

3.1 DECISION TREE¶

In [ ]:
# Creating classifier with default parameters
dec_tree = DecisionTreeClassifier(featuresCol='clf_vector',labelCol='fake').fit(data_train)
predictions = dec_tree.transform(data_test)
predictions.show(5)
+--------------------+----+-------------+--------------------+----------+
|          clf_vector|fake|rawPrediction|         probability|prediction|
+--------------------+----+-------------+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|   [1.0,86.0]|[0.01149425287356...|       1.0|
|(97,[0,1,2,11,36,...|   1|   [1.0,86.0]|[0.01149425287356...|       1.0|
|(97,[0,1,2,14,38,...|   1|    [1.0,1.0]|           [0.5,0.5]|       0.0|
|(97,[0,1,2,14,38,...|   1|    [1.0,1.0]|           [0.5,0.5]|       0.0|
|(97,[0,1,2,29,88,...|   1|   [1.0,86.0]|[0.01149425287356...|       1.0|
+--------------------+----+-------------+--------------------+----------+
only showing top 5 rows

In [ ]:
# Getting predictions and correct label to later pass as rdd to get classifier metrics
preds_and_labels = predictions.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics = BinaryClassificationMetrics(preds_and_labels.rdd.map(tuple))
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Again collecting predictions and true label, this time as lists, to use for example in confusion matrix
# Also collecting probabilities to be used in ROC curve
y_pred = predictions.select('prediction').rdd.keys().collect()
y_proba = predictions.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_test = predictions.select("fake").rdd.keys().collect()
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
In [ ]:
# Metrics report
print('Accuracy:', np.round(metrics.accuracy ,4))
print('AUC:', np.round(bin_metrics.areaUnderROC ,4))
print('AP:', np.round(bin_metrics.areaUnderPR ,4), '\n')

print('Precision for class 0:', np.round(metrics.precision(0.0) ,4))
print('Precision for class 1:', np.round(metrics.precision(1.0) ,4))
print('Weighted Precision:', np.round(metrics.weightedPrecision ,4), '\n')

print('Recall for class 0:', np.round(metrics.recall(0.0) ,4))
print('Recall for class 1:', np.round(metrics.recall(1.0) ,4))
print('Weighted Recall:', np.round(metrics.weightedRecall ,4), '\n')

print('F-score for class 0:', np.round(metrics.fMeasure(0.0) ,4))
print('F-score for class 1:', np.round(metrics.fMeasure(1.0) ,4))
print('Weighted F-score:', np.round(metrics.weightedFMeasure(), 4))
Accuracy: 0.9238
AUC: 0.9268
AP: 0.9553 

Precision for class 0: 0.8704
Precision for class 1: 0.9858
Weighted Precision: 0.931 

Recall for class 0: 0.9862
Recall for class 1: 0.8674
Weighted Recall: 0.9238 

F-score for class 0: 0.9247
F-score for class 1: 0.9228
Weighted F-score: 0.9237
In [ ]:
# Plotting confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")
plt.title('Confusion Matrix for Decision Tree')
plt.savefig('CM-DT.png', dpi=600) 
plt.show()
In [ ]:
# Getting false and true positive rate to plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

plt.plot(fpr, tpr, label='DT AUC = '+str(np.round(bin_metrics.areaUnderROC,4)))
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve for Decision Tree')  
plt.legend(loc='lower right')
# plt.savefig('CM.png', dpi=600) 
plt.show()
In [ ]:
# Getting precision and recall to plot PR curve
prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

plt.plot(prec, rec, label='DT AP = ' + str(np.round(bin_metrics.areaUnderPR,4)))
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve for Decision Tree') 
plt.legend(loc='lower left')
# plt.savefig('CM.png', dpi=600) 
plt.show()

3.2 LOGISTIC REGRESSION¶

In [ ]:
log_reg = LogisticRegression(featuresCol='clf_vector', labelCol='fake').fit(data_train)
predictions = log_reg.transform(data_test)
predictions.show(5)
+--------------------+----+--------------------+--------------------+----------+
|          clf_vector|fake|       rawPrediction|         probability|prediction|
+--------------------+----+--------------------+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|[-5.0605105447457...|[0.00630234912869...|       1.0|
|(97,[0,1,2,11,36,...|   1|[-2.2448719804033...|[0.09579271612831...|       1.0|
|(97,[0,1,2,14,38,...|   1|[-9.0465543177697...|[1.17782352489219...|       1.0|
|(97,[0,1,2,14,38,...|   1|[-28.515702804197...|[4.12845565952549...|       1.0|
|(97,[0,1,2,29,88,...|   1|[-7.9252034714982...|[3.61385637293547...|       1.0|
+--------------------+----+--------------------+--------------------+----------+
only showing top 5 rows

In [ ]:
beta = np.sort(log_reg.coefficients)
plt.plot(beta)
plt.title('Beta Coefficients')
plt.ylabel('β')
plt.show()
In [ ]:
# Getting predictions and correct label to later pass as rdd to get classifier metrics
preds_and_labels = predictions.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics = BinaryClassificationMetrics(preds_and_labels.rdd.map(tuple))
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Again collecting predictions and true label, this time as lists, to use for example in confusion matrix
# Also collecting probabilities to be used in ROC curve
y_pred = predictions.select('prediction').rdd.keys().collect()
y_proba = predictions.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_test = predictions.select("fake").rdd.keys().collect()
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
In [ ]:
# Metrics report
print('Accuracy:', np.round(metrics.accuracy ,4))
print('AUC:', np.round(bin_metrics.areaUnderROC ,4))
print('AP:', np.round(bin_metrics.areaUnderPR ,4), '\n')

print('Precision for class 0:', np.round(metrics.precision(0.0) ,4))
print('Precision for class 1:', np.round(metrics.precision(1.0) ,4))
print('Weighted Precision:', np.round(metrics.weightedPrecision ,4), '\n')

print('Recall for class 0:', np.round(metrics.recall(0.0) ,4))
print('Recall for class 1:', np.round(metrics.recall(1.0) ,4))
print('Weighted Recall:', np.round(metrics.weightedRecall ,4), '\n')

print('F-score for class 0:', np.round(metrics.fMeasure(0.0) ,4))
print('F-score for class 1:', np.round(metrics.fMeasure(1.0) ,4))
print('Weighted F-score:', np.round(metrics.weightedFMeasure(), 4))
Accuracy: 0.9275
AUC: 0.9287
AP: 0.9345 

Precision for class 0: 0.9003
Precision for class 1: 0.955
Weighted Precision: 0.929 

Recall for class 0: 0.9527
Recall for class 1: 0.9048
Weighted Recall: 0.9275 

F-score for class 0: 0.9258
F-score for class 1: 0.9292
Weighted F-score: 0.9276
In [ ]:
evaluator = BinaryClassificationEvaluator(labelCol="fake")
evaluator.evaluate(predictions)
Out[ ]:
0.9706750575628119
In [ ]:
# Plotting confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")
plt.title('Confusion Matrix for Logistic Regression')
plt.savefig('CM-LR.png', dpi=600) 
plt.show()
In [ ]:
# Getting false and true positive rate to plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

plt.plot(fpr, tpr, label='LR AUC = '+str(np.round(bin_metrics.areaUnderROC,4)))
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')  
plt.legend(loc='lower right')
# plt.savefig('CM.png', dpi=600) 
plt.show()
In [ ]:
# Getting precision and recall to plot PR curve
prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

plt.plot(prec, rec, label='LR AP = ' + str(np.round(bin_metrics.areaUnderPR,4)))
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve for Logistic Regression') 
plt.legend(loc='lower left')
# plt.savefig('CM.png', dpi=600) 
plt.show()

3.3 SUPPORT VECTOR MACHINE¶

In [ ]:
svc = LinearSVC(featuresCol='clf_vector', labelCol='fake').fit(data_train)
predictions = svc.transform(data_test)
predictions.show(5)
+--------------------+----+--------------------+----------+
|          clf_vector|fake|       rawPrediction|prediction|
+--------------------+----+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|[-3.4991893156379...|       1.0|
|(97,[0,1,2,11,36,...|   1|[-1.4571711288731...|       1.0|
|(97,[0,1,2,14,38,...|   1|[-6.4068265033188...|       1.0|
|(97,[0,1,2,14,38,...|   1|[-11.385839797132...|       1.0|
|(97,[0,1,2,29,88,...|   1|[-5.5722883965936...|       1.0|
+--------------------+----+--------------------+----------+
only showing top 5 rows

In [ ]:
beta = np.sort(svc.coefficients)
plt.plot(beta)
plt.title('Beta Coefficients')
plt.ylabel('β')
plt.show()
In [ ]:
# Getting predictions and correct label to later pass as rdd to get classifier metrics
preds_and_labels = predictions.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics = BinaryClassificationMetrics(preds_and_labels.rdd.map(tuple))
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Again collecting predictions and true label, this time as lists, to use for example in confusion matrix
# Also collecting probabilities to be used in ROC curve
y_pred = predictions.select('prediction').rdd.keys().collect()
y_proba = predictions.select(vector_to_array("rawPrediction")[1]).rdd.keys().collect()
y_test = predictions.select("fake").rdd.keys().collect()
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
In [ ]:
# Metrics report
print('Accuracy:', np.round(metrics.accuracy ,4))
print('AUC:', np.round(bin_metrics.areaUnderROC ,4))
print('AP:', np.round(bin_metrics.areaUnderPR ,4), '\n')

print('Precision for class 0:', np.round(metrics.precision(0.0) ,4))
print('Precision for class 1:', np.round(metrics.precision(1.0) ,4))
print('Weighted Precision:', np.round(metrics.weightedPrecision ,4), '\n')

print('Recall for class 0:', np.round(metrics.recall(0.0) ,4))
print('Recall for class 1:', np.round(metrics.recall(1.0) ,4))
print('Weighted Recall:', np.round(metrics.weightedRecall ,4), '\n')

print('F-score for class 0:', np.round(metrics.fMeasure(0.0) ,4))
print('F-score for class 1:', np.round(metrics.fMeasure(1.0) ,4))
print('Weighted F-score:', np.round(metrics.weightedFMeasure(), 4))
Accuracy: 0.9341
AUC: 0.9356
AP: 0.9466 

Precision for class 0: 0.9016
Precision for class 1: 0.9677
Weighted Precision: 0.9363 

Recall for class 0: 0.9665
Recall for class 1: 0.9048
Weighted Recall: 0.9341 

F-score for class 0: 0.9329
F-score for class 1: 0.9352
Weighted F-score: 0.9341
In [ ]:
# Plotting confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")
plt.title('Confusion Matrix for SVC')
# plt.savefig('CM.png', dpi=600) 
plt.show()
In [ ]:
# Getting false and true positive rate to plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

plt.plot(fpr, tpr, label='SVC AUC = '+str(np.round(bin_metrics.areaUnderROC,4)))
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve for SVC')  
plt.legend(loc='lower right')
# plt.savefig('CM.png', dpi=600) 
plt.show()
In [ ]:
# Getting precision and recall to plot PR curve
prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

plt.plot(prec, rec, label='SVC AP = ' + str(np.round(bin_metrics.areaUnderPR,4)))
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve for SVC') 
plt.legend(loc='lower left')
# plt.savefig('CM.png', dpi=600) 
plt.show()

3.4 RANDOM FOREST¶

In [ ]:
# Creating classifier with default parameters
rand_forest = RandomForestClassifier(featuresCol='clf_vector',labelCol='fake').fit(data_train)
predictions = rand_forest.transform(data_test)
predictions.show(5)
+--------------------+----+--------------------+--------------------+----------+
|          clf_vector|fake|       rawPrediction|         probability|prediction|
+--------------------+----+--------------------+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|[9.29800237752921...|[0.46490011887646...|       1.0|
|(97,[0,1,2,11,36,...|   1|[9.30202615956936...|[0.46510130797846...|       1.0|
|(97,[0,1,2,14,38,...|   1|[4.26613876994833...|[0.21330693849741...|       1.0|
|(97,[0,1,2,14,38,...|   1|[1.39561586888289...|[0.06978079344414...|       1.0|
|(97,[0,1,2,29,88,...|   1|[4.95849658212931...|[0.24792482910646...|       1.0|
+--------------------+----+--------------------+--------------------+----------+
only showing top 5 rows

In [ ]:
# Getting predictions and correct label to later pass as rdd to get classifier metrics
preds_and_labels = predictions.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics = BinaryClassificationMetrics(preds_and_labels.rdd.map(tuple))
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Again collecting predictions and true label, this time as lists, to use for example in confusion matrix
# Also collecting probabilities to be used in ROC curve
y_pred = predictions.select('prediction').rdd.keys().collect()
y_proba = predictions.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_test = predictions.select("fake").rdd.keys().collect()
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
In [ ]:
# Metrics report
print('Accuracy:', np.round(metrics.accuracy ,4))
print('AUC:', np.round(bin_metrics.areaUnderROC ,4))
print('AP:', np.round(bin_metrics.areaUnderPR ,4), '\n')

print('Precision for class 0:', np.round(metrics.precision(0.0) ,4))
print('Precision for class 1:', np.round(metrics.precision(1.0) ,4))
print('Weighted Precision:', np.round(metrics.weightedPrecision ,4), '\n')

print('Recall for class 0:', np.round(metrics.recall(0.0) ,4))
print('Recall for class 1:', np.round(metrics.recall(1.0) ,4))
print('Weighted Recall:', np.round(metrics.weightedRecall ,4), '\n')

print('F-score for class 0:', np.round(metrics.fMeasure(0.0) ,4))
print('F-score for class 1:', np.round(metrics.fMeasure(1.0) ,4))
print('Weighted F-score:', np.round(metrics.weightedFMeasure(), 4))
Accuracy: 0.9312
AUC: 0.9338
AP: 0.958 

Precision for class 0: 0.8834
Precision for class 1: 0.985
Weighted Precision: 0.9368 

Recall for class 0: 0.9851
Recall for class 1: 0.8826
Weighted Recall: 0.9312 

F-score for class 0: 0.9315
F-score for class 1: 0.931
Weighted F-score: 0.9312
In [ ]:
# Plotting confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")
plt.title('Confusion Matrix for Random Forest')
# plt.savefig('CM.png', dpi=600) 
plt.show()
In [ ]:
# Getting false and true positive rate to plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

plt.plot(fpr, tpr, label='RF AUC = '+str(np.round(bin_metrics.areaUnderROC,4)))
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve for Random Forest')  
plt.legend(loc='lower right')
# plt.savefig('CM.png', dpi=600) 
plt.show()
In [ ]:
# Getting precision and recall to plot PR curve
prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

plt.plot(prec, rec, label='RF AP = ' + str(np.round(bin_metrics.areaUnderPR,4)))
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve for Random Forest') 
plt.legend(loc='lower left')
# plt.savefig('CM.png', dpi=600) 
plt.show()

3.5 COMPARING CLASSIFIERS¶

In [ ]:
model_dt = DecisionTreeClassifier(featuresCol='clf_vector', labelCol='fake').fit(data_train)
model_logreg = LogisticRegression(featuresCol='clf_vector', labelCol='fake').fit(data_train)
model_svc = LinearSVC(featuresCol='clf_vector', labelCol='fake').fit(data_train)
model_rfc = RandomForestClassifier(featuresCol='clf_vector', labelCol='fake').fit(data_train)

models = {
  'Decision Tree': model_dt,
  'Logistic Regression': model_logreg,
  'Support Vector Machine': model_svc,
  'Random Forest': model_rfc,
  }
In [ ]:
y_pred = {}
y_proba = {}
y_test = {}

for name, model in models.items():
  print(name, 'metrics:')
  predictions = model.transform(data_test)

  preds_and_labels = predictions.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                             .withColumn("fake",col("fake").cast('float'))
  metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

  y_pred[name] = predictions.select('prediction').rdd.keys().collect()
  if "proability" in predictions.columns:
    y_proba[name] = predictions.select(vector_to_array("probability")[1]).rdd.keys().collect()
  else:
    y_proba[name] = predictions.select(vector_to_array("rawPrediction")[1]).rdd.keys().collect()
  y_test[name] = predictions.select("fake").rdd.keys().collect()

  print('Accuracy:', np.round(metrics.accuracy ,4))
  print('Precision:', np.round(metrics.weightedPrecision ,4))
  print('Recall:', np.round(metrics.weightedRecall ,4))
  print('F-score:', np.round(metrics.weightedFMeasure(), 4), '\n')
Decision Tree metrics:
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
Accuracy: 0.9238
Precision: 0.931
Recall: 0.9238
F-score: 0.9237 

Logistic Regression metrics:
Accuracy: 0.9275
Precision: 0.929
Recall: 0.9275
F-score: 0.9276 

Support Vector Machine metrics:
Accuracy: 0.9341
Precision: 0.9363
Recall: 0.9341
F-score: 0.9341 

Random Forest metrics:
Accuracy: 0.9312
Precision: 0.9368
Recall: 0.9312
F-score: 0.9312 

In [ ]:
fpr, tpr, thresholds = roc_curve(y_test['Decision Tree'], y_proba['Decision Tree'])
fpr2, tpr2, thresholds2 = roc_curve(y_test['Logistic Regression'], y_proba['Logistic Regression'])
fpr3, tpr3, thresholds3 = roc_curve(y_test['Support Vector Machine'], y_proba['Support Vector Machine'])
fpr4, tpr4, thresholds4 = roc_curve(y_test['Random Forest'], y_proba['Random Forest'])

plt.plot(fpr, tpr, color='olivedrab', label='DT AUC = ' + 
         str(getAUC(y_test['Decision Tree'], y_pred['Decision Tree'])))
plt.plot(fpr2, tpr2, color='turquoise', label='LR AUC = ' + 
         str(getAUC(y_test['Logistic Regression'], y_pred['Logistic Regression'])))
plt.plot(fpr3, tpr3, color='darkorange', label='SVC AUC = ' + 
         str(getAUC(y_test['Support Vector Machine'], y_pred['Support Vector Machine'])))
plt.plot(fpr4, tpr4, color='navy', label='RF AUC = ' + 
         str(getAUC(y_test['Random Forest'], y_pred['Random Forest'])))
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve - Comparison across models')  
plt.legend(loc='lower right')
plt.savefig('ComparisonModelsROC.png', dpi=600) 
plt.show()
In [ ]:
# Getting precision and recall to plot PR curve
prec, rec, thresholds = precision_recall_curve(y_test['Decision Tree'], y_proba['Decision Tree'])
prec2, rec2, thresholds2 = precision_recall_curve(y_test['Logistic Regression'], y_proba['Logistic Regression'])
prec3, rec3, thresholds3 = precision_recall_curve(y_test['Support Vector Machine'], y_proba['Support Vector Machine'])
prec4, rec4, thresholds4 = precision_recall_curve(y_test['Random Forest'], y_proba['Random Forest'])

plt.plot(prec, rec, color='olivedrab', label='DT AP = ' + 
         str(getAUC(y_test['Decision Tree'], y_pred['Decision Tree'])))
plt.plot(prec2, rec2, color='turquoise', label='LR AP = ' + 
         str(getAUC(y_test['Logistic Regression'], y_pred['Logistic Regression'])))
plt.plot(prec3, rec3, color='darkorange', label='SVC AP = ' + 
         str(getAUC(y_test['Support Vector Machine'], y_pred['Support Vector Machine'])))
plt.plot(prec4, rec4, color='navy', label='RF AP = ' + 
         str(getAUC(y_test['Random Forest'], y_pred['Random Forest'])))
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve - Comparison across models') 
plt.legend(loc='lower left')
plt.savefig('ComparisonModelsPR.png', dpi=600) 
plt.show()

3.6 COMPARING TRAINING VECTORS¶

In [ ]:
# Splitting training and testing data
(data_train, data_test) = data_clf.randomSplit([0.7, 0.3], seed = 42)
print('Number of training records: ', data_train.count())
print('Number of testing records: ', data_test.count())
data_test.show(5)
Number of training records:  31567
Number of testing records:  13286
+---------------+--------------------+-------------+------------------+--------------------+----+--------------------+
|titleTokens_vec|      textTokens_vec|  subject_vec|          date_vec|       full_features|fake|          clf_vector|
+---------------+--------------------+-------------+------------------+--------------------+----+--------------------+
|     (88,[],[])|        (3024,[],[])|(6,[1],[1.0])| [2017.0,1.0,20.0]|(3121,[3113,3118,...|   1|(97,[89,94,95,96]...|
|     (88,[],[])|        (3024,[],[])|(6,[2],[1.0])| [2017.0,3.0,30.0]|(3121,[3114,3118,...|   1|(97,[90,94,95,96]...|
|     (88,[],[])|(3024,[0,1,2,3,4,...|(6,[1],[1.0])| [2016.0,1.0,12.0]|(3121,[88,89,90,9...|   1|(97,[89,94,95,96]...|
|     (88,[],[])|(3024,[0,1,2,3,4,...|(6,[2],[1.0])| [2016.0,1.0,12.0]|(3121,[88,89,90,9...|   1|(97,[90,94,95,96]...|
|     (88,[],[])|(3024,[0,1,2,3,4,...|(6,[5],[1.0])|[2017.0,12.0,28.0]|(3121,[88,89,90,9...|   1|(97,[93,94,95,96]...|
+---------------+--------------------+-------------+------------------+--------------------+----+--------------------+
only showing top 5 rows

In [ ]:
model_svc = LinearSVC(featuresCol='clf_vector', labelCol='fake').fit(data_train)
model_svc2 = LinearSVC(featuresCol='titleTokens_vec', labelCol='fake').fit(data_train)
model_svc3 = LinearSVC(featuresCol='textTokens_vec', labelCol='fake').fit(data_train)
model_svc4 = LinearSVC(featuresCol='subject_vec', labelCol='fake').fit(data_train)
model_svc5 = LinearSVC(featuresCol='date_vec', labelCol='fake').fit(data_train)
model_svc6 = LinearSVC(featuresCol='full_features', labelCol='fake').fit(data_train)

models = {
  'SVC': model_svc,
  'SVC2': model_svc2,
  'SVC3': model_svc3,
  'SVC4': model_svc4,
  'SVC5': model_svc5,
  'SVC6': model_svc6,
  }
In [ ]:
y_pred2 = {}
y_proba2 = {}
y_test2 = {}
AUC2 = {}
AP2 = {}

for name, model in models.items():
  print(name, 'metrics')
  predictions = model.transform(data_test)

  preds_and_labels = predictions.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                             .withColumn("fake",col("fake").cast('float'))

  metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

  y_pred2[name] = predictions.select('prediction').rdd.keys().collect()
  if "proability" in predictions.columns:
    y_proba2[name] = predictions.select(vector_to_array("probability")[1]).rdd.keys().collect()
  else:
    y_proba2[name] = predictions.select(vector_to_array("rawPrediction")[1]).rdd.keys().collect()
  y_test2[name] = predictions.select("fake").rdd.keys().collect()

  print('Accuracy:', np.round(metrics.accuracy ,4))
  print('Precision:', np.round(metrics.weightedPrecision ,4))
  print('Recall:', np.round(metrics.weightedRecall ,4))
  print('F-score:', np.round(metrics.weightedFMeasure(), 4), '\n')
SVC metrics
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
Accuracy: 0.936
Precision: 0.9383
Recall: 0.936
F-score: 0.9361 

SVC2 metrics
Accuracy: 0.8197
Precision: 0.8388
Recall: 0.8197
F-score: 0.8184 

SVC3 metrics
Accuracy: 0.9942
Precision: 0.9942
Recall: 0.9942
F-score: 0.9942 

SVC4 metrics
Accuracy: 0.6225
Precision: 0.6222
Recall: 0.6225
F-score: 0.6191 

SVC5 metrics
Accuracy: 0.6921
Precision: 0.7038
Recall: 0.6921
F-score: 0.6904 

SVC6 metrics
Accuracy: 0.9959
Precision: 0.9959
Recall: 0.9959
F-score: 0.9959 

In [ ]:
fpr, tpr, thresholds = roc_curve(y_test2['SVC'], y_proba2['SVC'])
fpr2, tpr2, thresholds2 = roc_curve(y_test2['SVC2'], y_proba2['SVC2'])
fpr3, tpr3, thresholds3 = roc_curve(y_test2['SVC3'], y_proba2['SVC3'])
fpr4, tpr4, thresholds4 = roc_curve(y_test2['SVC4'], y_proba2['SVC4'])
fpr5, tpr5, thresholds5 = roc_curve(y_test2['SVC5'], y_proba2['SVC5'])
fpr6, tpr6, thresholds6 = roc_curve(y_test2['SVC6'], y_proba2['SVC6'])

plt.plot(fpr2, tpr2, color= 'turquoise', label='Title AUC = ' + 
         str(getAUC(y_test2['SVC2'], y_pred2['SVC2'])))
plt.plot(fpr3, tpr3, color= 'darkorange', label='Text AUC = ' + 
         str(getAUC(y_test2['SVC3'], y_pred2['SVC3'])))
plt.plot(fpr4, tpr4, color= 'navy', label='Subject AUC = ' + 
         str(getAUC(y_test2['SVC4'], y_pred2['SVC4'])))
plt.plot(fpr5, tpr5, color= 'purple', label='Date AUC = ' + 
         str(getAUC(y_test2['SVC5'], y_pred2['SVC5'])))
plt.plot(fpr6, tpr6, color= 'teal', label='Full features AUC = ' + 
         str(getAUC(y_test2['SVC6'], y_pred2['SVC6'])))
plt.plot(fpr, tpr, color= 'olivedrab', label='Title+Subject+Date AUC = ' + 
         str(getAUC(y_test2['SVC'], y_pred2['SVC'])))

plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve - Comparison across input data')  
plt.legend(loc='lower right')
plt.savefig('ComparisonDataROC.png', dpi=600) 
plt.show()
In [ ]:
# Getting precision and recall to plot PR curve
prec, rec, thresholds = precision_recall_curve(y_test2['SVC'], y_proba2['SVC'])
prec2, rec2, thresholds2 = precision_recall_curve(y_test2['SVC2'], y_proba2['SVC2'])
prec3, rec3, thresholds3 = precision_recall_curve(y_test2['SVC3'], y_proba2['SVC3'])
prec4, rec4, thresholds4 = precision_recall_curve(y_test2['SVC4'], y_proba2['SVC4'])
prec5, rec5, thresholds5 = precision_recall_curve(y_test2['SVC5'], y_proba2['SVC5'])
prec6, rec6, thresholds6 = precision_recall_curve(y_test2['SVC6'], y_proba2['SVC6'])

plt.plot(prec2, rec2, color= 'turquoise', label='Ttitle AP = ' + 
         str(getAP(y_test2['SVC2'], y_pred2['SVC2'])))
plt.plot(prec3, rec3, color= 'darkorange', label='Text AP = ' + 
         str(getAP(y_test2['SVC3'], y_pred2['SVC3'])))
plt.plot(prec4, rec4, color= 'navy', label='Subject AP = ' + 
         str(getAP(y_test2['SVC4'], y_pred2['SVC4'])))
plt.plot(prec5, rec5, color= 'purple', label='Date AP = ' + 
         str(getAP(y_test2['SVC5'], y_pred2['SVC5'])))
plt.plot(prec6, rec6, color= 'teal', label='Full features AP = ' + 
         str(getAP(y_test2['SVC6'], y_pred2['SVC6'])))
plt.plot(prec, rec, color= 'olivedrab', label='Title+Subject+Date AP = ' + 
         str(getAP(y_test2['SVC'], y_pred2['SVC'])))
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.title('PR Curve - Comparison across input data') 
plt.legend(loc='lower left')
plt.savefig('ComparisonDataPR.png', dpi=600) 
plt.show()

3.7 PARAMETER HYPERTUNING¶

In [ ]:
# Splitting training and testing data
(data_train, data_test) = data_clf.select('clf_vector','fake').randomSplit([0.7, 0.3], seed = 42)
print('Number of training records: ', data_train.count())
print('Number of testing records: ', data_test.count())
data_test.show(5)
Number of training records:  31567
Number of testing records:  13286
+--------------------+----+
|          clf_vector|fake|
+--------------------+----+
|(97,[0,1,2,4,12,4...|   1|
|(97,[0,1,2,11,36,...|   1|
|(97,[0,1,2,14,38,...|   1|
|(97,[0,1,2,14,38,...|   1|
|(97,[0,1,2,29,88,...|   1|
+--------------------+----+
only showing top 5 rows

In [ ]:
dt = DecisionTreeClassifier(labelCol = "fake", featuresCol = "clf_vector")
lr = LogisticRegression(labelCol = "fake", featuresCol = "clf_vector")
svc = LinearSVC(labelCol = "fake", featuresCol = "clf_vector")
rf = RandomForestClassifier(labelCol = "fake", featuresCol = "clf_vector")

paramGrid_dt = ParamGridBuilder() \
              .addGrid(dt.maxDepth, [5,15,25]) \
              .addGrid(dt.maxBins, [10,30,50]) \
              .build()
  
paramGrid_lr = ParamGridBuilder() \
              .addGrid(lr.regParam, [.01,.1,1,10]) \
              .addGrid(lr.fitIntercept, [True, False]) \
              .addGrid(lr.elasticNetParam, [.01,.05,.1,.5]) \
              .addGrid(lr.maxIter, [100, 200, 300]) \
              .build()

paramGrid_svc = ParamGridBuilder() \
                .addGrid(svc.regParam, [.01,.1,1,10]) \
                .addGrid(svc.fitIntercept, [True, False]) \
                .addGrid(svc.standardization, [True, False]) \
                .addGrid(svc.maxIter, [100, 200, 300]) \
                .build()
  
paramGrid_rf = ParamGridBuilder() \
              .addGrid(rf.numTrees, [30,40,50,60]) \
              .addGrid(rf.maxDepth, [5,15,25,30]) \
              .build()

crossval_dt = CrossValidator(estimator = dt,
                             estimatorParamMaps = paramGrid_dt,
                             evaluator = BinaryClassificationEvaluator(labelCol = 'fake'),
                             seed = 42,
                             numFolds=5)

crossval_lr = CrossValidator(estimator = lr,
                             estimatorParamMaps = paramGrid_lr,
                             evaluator = BinaryClassificationEvaluator(labelCol = 'fake'),
                             seed = 42,
                             numFolds=5)

crossval_svc = CrossValidator(estimator = svc,
                              estimatorParamMaps = paramGrid_svc,
                              evaluator = BinaryClassificationEvaluator(labelCol = 'fake'),
                              seed = 42,
                              numFolds=5)

crossval_rf = CrossValidator(estimator = rf,
                             estimatorParamMaps = paramGrid_rf,
                             evaluator = BinaryClassificationEvaluator(labelCol = 'fake'),
                             seed = 42,
                             numFolds=5)

DECISION TREE

In [ ]:
cv_dt = crossval_dt.fit(data_train)
predictions_dt = cv_dt.transform(data_test)
predictions_dt.show(5)
+--------------------+----+-------------+-----------+----------+
|          clf_vector|fake|rawPrediction|probability|prediction|
+--------------------+----+-------------+-----------+----------+
|(97,[0,1,2,4,12,4...|   1|   [0.0,78.0]|  [0.0,1.0]|       1.0|
|(97,[0,1,2,11,36,...|   1|   [0.0,78.0]|  [0.0,1.0]|       1.0|
|(97,[0,1,2,14,38,...|   1|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|(97,[0,1,2,14,38,...|   1|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|(97,[0,1,2,29,88,...|   1|   [0.0,78.0]|  [0.0,1.0]|       1.0|
+--------------------+----+-------------+-----------+----------+
only showing top 5 rows

In [ ]:
preds_and_labels_dt = predictions_dt.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics_dt = BinaryClassificationMetrics(preds_and_labels_dt.rdd.map(tuple))                                     
metrics_dt = MulticlassMetrics(preds_and_labels_dt.rdd.map(tuple))

y_pred_dt = predictions_dt.select('prediction').rdd.keys().collect()
y_proba_dt = predictions_dt.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_test_dt = predictions_dt.select("fake").rdd.keys().collect()
print('Accuracy:', np.round(metrics_dt.accuracy ,4))
print('Precision:', np.round(metrics_dt.weightedPrecision ,4))
print('Recall:', np.round(metrics_dt.weightedRecall ,4))
print('F-score:', np.round(metrics_dt.weightedFMeasure(), 4))
print('AUC:', np.round(bin_metrics_dt.areaUnderROC ,4))
print('AP:', np.round(bin_metrics_dt.areaUnderPR ,4))  

cv_dt.bestModel
Accuracy: 0.9483
Precision: 0.95
Recall: 0.9483
F-score: 0.9483
AUC: 0.9497
AP: 0.9603
Out[ ]:
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_604a66b53c34, depth=25, numNodes=1469, numClasses=2, numFeatures=97

LOGISTIC REGRESSION

In [ ]:
cv_lr = crossval_lr.fit(data_train)
predictions_lr = cv_lr.transform(data_test)
predictions_lr.show(5)
+--------------------+----+--------------------+--------------------+----------+
|          clf_vector|fake|       rawPrediction|         probability|prediction|
+--------------------+----+--------------------+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|[-1.8387299999503...|[0.13720156248671...|       1.0|
|(97,[0,1,2,11,36,...|   1|[0.05923038488208...|[0.51480326869046...|       0.0|
|(97,[0,1,2,14,38,...|   1|[-4.7536360910844...|[0.00854661960133...|       1.0|
|(97,[0,1,2,14,38,...|   1|[-8.9862431758592...|[1.25103609586948...|       1.0|
|(97,[0,1,2,29,88,...|   1|[-3.8848978598798...|[0.02013613157967...|       1.0|
+--------------------+----+--------------------+--------------------+----------+
only showing top 5 rows

In [ ]:
preds_and_labels_lr = predictions_lr.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics_lr = BinaryClassificationMetrics(preds_and_labels_lr.rdd.map(tuple))                                     
metrics_lr = MulticlassMetrics(preds_and_labels_lr.rdd.map(tuple))

y_pred_lr = predictions_lr.select('prediction').rdd.keys().collect()
y_proba_lr = predictions_lr.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_test_lr = predictions_lr.select("fake").rdd.keys().collect()
print('Accuracy:', np.round(metrics_lr.accuracy ,4))
print('Precision:', np.round(metrics_lr.weightedPrecision ,4))
print('Recall:', np.round(metrics_lr.weightedRecall ,4))
print('F-score:', np.round(metrics_lr.weightedFMeasure(), 4))
print('AUC:', np.round(bin_metrics_lr.areaUnderROC ,4))
print('AP:', np.round(bin_metrics_lr.areaUnderPR ,4))  

cv_lr.bestModel
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
Accuracy: 0.9225
Precision: 0.9245
Recall: 0.9225
F-score: 0.9225
AUC: 0.9239
AP: 0.9316
Out[ ]:
LogisticRegressionModel: uid=LogisticRegression_cc952a9e3afa, numClasses=2, numFeatures=97

SUPPORT VECTOR MACHINES

In [ ]:
cv_svc = crossval_svc.fit(data_train)
predictions_svc = cv_svc.transform(data_test)
predictions_svc.show(5)
+--------------------+----+--------------------+----------+
|          clf_vector|fake|       rawPrediction|prediction|
+--------------------+----+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|[-0.8020266643457...|       1.0|
|(97,[0,1,2,11,36,...|   1|[0.15726171166397...|       0.0|
|(97,[0,1,2,14,38,...|   1|[-2.2963921587193...|       1.0|
|(97,[0,1,2,14,38,...|   1|[-4.2376671373631...|       1.0|
|(97,[0,1,2,29,88,...|   1|[-1.7975925025994...|       1.0|
+--------------------+----+--------------------+----------+
only showing top 5 rows

In [ ]:
preds_and_labels_svc = predictions_svc.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics_svc = BinaryClassificationMetrics(preds_and_labels_svc.rdd.map(tuple))                                     
metrics_svc = MulticlassMetrics(preds_and_labels_svc.rdd.map(tuple))

y_pred_svc = predictions_svc.select('prediction').rdd.keys().collect()
y_proba_svc = predictions_svc.select(vector_to_array("rawPrediction")[1]).rdd.keys().collect()
y_test_svc = predictions_svc.select("fake").rdd.keys().collect()
print('Accuracy:', np.round(metrics_svc.accuracy ,4))
print('Precision:', np.round(metrics_svc.weightedPrecision ,4))
print('Recall:', np.round(metrics_svc.weightedRecall ,4))
print('F-score:', np.round(metrics_svc.weightedFMeasure(), 4))
print('AUC:', np.round(bin_metrics_svc.areaUnderROC ,4))
print('AP:', np.round(bin_metrics_svc.areaUnderPR ,4))  

cv_svc.bestModel
Accuracy: 0.9152
Precision: 0.921
Recall: 0.9152
F-score: 0.9152
AUC: 0.9179
AP: 0.9398
Out[ ]:
LinearSVCModel: uid=LinearSVC_bd4fc4335a6a, numClasses=2, numFeatures=97

RANDOM FORESTS

In [ ]:
cv_rf = crossval_rf.fit(data_train)
predictions_rf = cv_rf.transform(data_test)
predictions_rf.show(5)
+--------------------+----+--------------------+--------------------+----------+
|          clf_vector|fake|       rawPrediction|         probability|prediction|
+--------------------+----+--------------------+--------------------+----------+
|(97,[0,1,2,4,12,4...|   1|[12.2957339805866...|[0.24591467961173...|       1.0|
|(97,[0,1,2,11,36,...|   1|[5.79452004360091...|[0.11589040087201...|       1.0|
|(97,[0,1,2,14,38,...|   1|[8.61089977591247...|[0.17221799551824...|       1.0|
|(97,[0,1,2,14,38,...|   1|[2.22320153132455...|[0.04446403062649...|       1.0|
|(97,[0,1,2,29,88,...|   1|[3.19053094345557...|[0.06381061886911...|       1.0|
+--------------------+----+--------------------+--------------------+----------+
only showing top 5 rows

In [ ]:
preds_and_labels_rf = predictions_rf.select('prediction', 'fake').withColumn("prediction",col("prediction").cast('float')) \
                                                           .withColumn("fake",col("fake").cast('float'))

bin_metrics_rf = BinaryClassificationMetrics(preds_and_labels_rf.rdd.map(tuple))                                     
metrics_rf = MulticlassMetrics(preds_and_labels_rf.rdd.map(tuple))

y_pred_rf = predictions_rf.select('prediction').rdd.keys().collect()
y_proba_rf = predictions_rf.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_test_rf = predictions_rf.select("fake").rdd.keys().collect()
print('Accuracy:', np.round(metrics_rf.accuracy ,4))
print('Precision:', np.round(metrics_rf.weightedPrecision ,4))
print('Recall:', np.round(metrics_rf.weightedRecall ,4))
print('F-score:', np.round(metrics_rf.weightedFMeasure(), 4))
print('AUC:', np.round(bin_metrics_rf.areaUnderROC ,4))
print('AP:', np.round(bin_metrics_rf.areaUnderPR ,4))  

cv_rf.bestModel
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:157: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
Accuracy: 0.9552
Precision: 0.9572
Recall: 0.9552
F-score: 0.9552
AUC: 0.9567
AP: 0.9702
Out[ ]:
RandomForestClassificationModel: uid=RandomForestClassifier_6aa7bbfc41e8, numTrees=50, numClasses=2, numFeatures=97

4.0 FREQUENT PATTERN MINING¶

In [ ]:
# Removing numbers and symbols to remain only with words
new_data = fake_data.withColumn("text",regexp_replace(col('text'), '\d+', ''))

# Tokenizer
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="regToken", pattern="\\W")
new_data = regex_tokenizer.transform(new_data)

# Removing stopwords
remover = StopWordsRemover(inputCol="regToken", outputCol="tokens")
new_data = remover.transform(new_data)
data_fpm = new_data.select("tokens")
data_fpm.show(5)
+--------------------+
|              tokens|
+--------------------+
|[donald, trump, c...|
|[house, intellige...|
|[friday, revealed...|
|[christmas, day, ...|
|[pope, francis, u...|
+--------------------+
only showing top 5 rows

In [ ]:
# Obtaining unique items per observation
un_pre = [data[0] for data in data_fpm.select('tokens').collect()]
unique_val = []
num = 0
for x in un_pre:
  unique_val.append((num,list(set(x))))
  num += 1
In [ ]:
df = spark.createDataFrame(unique_val, ["id", "items"])

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.25, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show(10)
/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:125: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
+------------+-----+
|       items| freq|
+------------+-----+
|     [white]| 5935|
|      [also]| 8993|
|[also, said]| 5929|
|   [america]| 6230|
|       [one]|12319|
|[one, trump]| 7117|
| [one, said]| 7798|
|      [know]| 6552|
|      [news]| 7181|
|       [via]|11246|
+------------+-----+
only showing top 10 rows

In [ ]:
# Display generated association rules.
model.associationRules.show(10)
+----------+-----------+------------------+------------------+-------------------+
|antecedent| consequent|        confidence|              lift|            support|
+----------+-----------+------------------+------------------+-------------------+
|    [like]|      [one]| 0.631547736706859| 1.203267873061668|0.27225086276681865|
|    [like]|    [trump]|0.6058509586874876|1.1355048990939889|0.26117336287333304|
|    [like]|   [people]|0.6127693220003954|1.2579645549436962| 0.2641557666908099|
|    [like]|     [said]| 0.613757659616525|1.1271913950594254|0.26458182437902095|
|   [trump]|[president]|  0.61151481274455|1.3627861915996329| 0.3262749776319714|
|   [trump]|   [donald]|0.7249061726423381|1.8647821983875843|0.38677516935793105|
|     [via]|      [one]|0.6216432509336653| 1.184397170441112|0.29785692982829876|
|     [via]|     [said]|0.6436066156855771|1.1820102407477449|0.30838055472711007|
|     [via]| [featured]|0.6219989329539392|1.7203555214897368| 0.2980273529035831|
|     [via]|    [image]|0.6546327583140672|1.7089184151250663|0.31366367006092627|
+----------+-----------+------------------+------------------+-------------------+
only showing top 10 rows

In [ ]:
# transform examines the input items against all the association rules and summarize the consequents as prediction
model.transform(df).show(10)
+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[twitter, wendy, ...|              [said]|
|  1|[isn, president, ...|[one, people, don...|
|  2|[twitter, deplane...|   [said, president]|
|  3|[serve, number, t...|         [one, said]|
|  4|[environmental, w...|         [president]|
|  5|[number, needs, s...|  [one, said, trump]|
|  6|[twitter, deleted...|              [said]|
|  7|[looked, fully, a...|         [one, said]|
|  8|[beyond, senate, ...|              [said]|
|  9|[stroke, kinds, s...|         [one, said]|
+---+--------------------+--------------------+
only showing top 10 rows

In [ ]:
# checking for same consequent in the association rules
ass_rules_con = [data[0] for data in model.associationRules.select('consequent').collect()]

# creating a dictionary to count the occurence of the same world in the consequent of the association rules
dict_rules_con = {}
for x in ass_rules_con:
  k = tuple(x)
  if k not in dict_rules_con.keys():
    dict_rules_con[k] = 1
  else:
    dict_rules_con[k] += 1
#print(dict_rules_con)
In [ ]:
# checking for same antecedent in the association rules
ass_rules_ant = [data[0] for data in model.associationRules.select('antecedent').collect()]

# number of association rules found with a minumum support of 0,25
print(len(ass_rules_ant))

#creating a dictionary to count the occurence of the same world in the antecedent of the association rules
dict_rules = {}
for x in ass_rules_ant:
  k = tuple(x)
  if k not in dict_rules.keys():
    dict_rules[k] = 1
  else:
    dict_rules[k] += 1
#print(dict_rules)
31
In [ ]:
# checking for the top ten of frequent items
freq_item_data = [data[0] for data in model.freqItemsets.select('items').collect()] 
freq_item = [data[0] for data in model.freqItemsets.select('freq').collect()] 

sort_freq = sorted(freq_item)[len(freq_item)-10:]

most_used_item = []

for i in range(len(freq_item)):
  if freq_item[i] in sort_freq:
    most_used_item.append(freq_item_data[i])

#print(most_used_item)
In [ ]:
# checking for model with different minSup [0.05,0.95]
c = 0.05
for x in range(18):
  print("Modello con minSup a {}\n".format(c))
  fpGrowth = FPGrowth(itemsCol="items", minSupport=c, minConfidence=0.6)
  c += 0.05
  model = fpGrowth.fit(df)

  # Display frequent itemsets.
  model.freqItemsets.show()

  # Display generated association rules.
  model.associationRules.show()

  # transform examines the input items against all the association rules and summarize the
  # consequents as prediction
  model.transform(df).show()
Modello con minSup a 0.1

/usr/local/lib/python3.8/dist-packages/pyspark/sql/context.py:125: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
  warnings.warn(
+--------------+----+
|         items|freq|
+--------------+----+
|      [person]|2972|
|     [working]|2420|
|           [m]|3901|
|      [m, one]|2490|
|   [m, people]|2426|
|     [m, said]|2758|
|     [history]|2580|
|      [enough]|2694|
|       [clear]|2828|
|        [long]|3614|
|   [long, one]|2457|
|  [long, said]|2369|
|     [believe]|3241|
|        [need]|3694|
|   [need, one]|2416|
|[need, people]|2423|
|        [used]|3295|
|         [fox]|2621|
|  [department]|2755|
|      [public]|4035|
+--------------+----+
only showing top 20 rows

+--------------------+----------+------------------+------------------+-------------------+
|          antecedent|consequent|        confidence|              lift|            support|
+--------------------+----------+------------------+------------------+-------------------+
|[image, via, peop...|[featured]|0.9588957055214724| 2.652161336824709|0.13318563333475353|
|[image, via, peop...|  [donald]|0.8429447852760736|2.1684301901813594|0.11708065272037833|
|        [think, one]|    [said]| 0.726722398385702|1.3346558225751808|0.10740914319798901|
|        [think, one]|  [people]|0.6785817238397233|1.3930719531393463|0.10029397980486558|
|         [make, via]|     [one]|0.6685499058380414|1.2737669323747602|0.10587533552042946|
|         [make, via]|  [people]|0.6599408124831854|1.3548037094194738|0.10451195091815432|
|         [make, via]|    [said]|0.6693570083400592| 1.229301904753484|0.10600315282689277|
|         [make, via]|   [image]|0.6938391175679312| 1.811266591973853|0.10988027778961271|
|         [make, via]|[featured]|0.6620930858218994|1.8312499195528873|0.10485279706872311|
|[like, people, said]|     [via]|0.6187530925284512| 1.291370605969703|0.10655702782156704|
|[like, people, said]|     [one]|0.6971796140524493| 1.328314207437701|0.12006305653785522|
|[like, people, said]|   [trump]|0.6224641266699653|1.1666418204160949|0.10719611435388351|
|              [good]|     [one]|0.6586384861145245|1.2548830187185651|0.11418346044054364|
|              [good]|   [trump]|0.6151388547554681|1.1529125656764028|0.10664223935920923|
|              [good]|  [people]|0.6320963381666257|1.2976413148875074|0.10958203740786503|
|              [good]|    [said]|0.6333251413123617|1.1631278866778123|0.10979506625197051|
|[donald, presiden...|[featured]|0.8197492163009404|2.2673030704453656|0.11141408546717226|
|[donald, presiden...|   [trump]|0.9974921630094044|1.8695311473284142|0.13557155638873503|
|[donald, presiden...|   [image]|0.8539184952978056|2.2291537096134797|0.11605811426867198|
|        [made, said]|     [one]|0.6805134189031505|1.2965606343920648|0.12423842188232287|
+--------------------+----------+------------------+------------------+-------------------+
only showing top 20 rows

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[star, rainyday, ...|[said, via, image...|
|  1|[fool, us, commit...|[one, donald, peo...|
|  2|[fwfxszupxy, clar...|[said, president,...|
|  3|[using, star, sin...|[one, said, via, ...|
|  4|[internationally,...|[via, president, ...|
|  5|[shared, posthast...|[trump, one, dona...|
|  6|[since, twitter, ...|[one, people, sai...|
|  7|[looked, despite,...|         [one, said]|
|  8|[looks, intent, p...|[said, states, wh...|
|  9|[break, anymore, ...|   [one, said, like]|
| 10|[pay, opinions, k...|[president, said,...|
| 11|[wait, mark, fami...|[donald, presiden...|
| 12|[looked, looks, b...|[people, via, ima...|
| 13|[concerns, using,...|[donald, one, peo...|
| 14|[health, pay, bre...|[said, president,...|
| 15|[invited, therapy...|[people, presiden...|
| 16|[accurate, mark, ...|    [featured, said]|
| 17|[brings, possibly...|[one, people, via...|
| 18|[despite, homosex...|            [donald]|
| 19|[wait, texeye, wi...|[one, said, donal...|
+---+--------------------+--------------------+
only showing top 20 rows

Modello con minSup a 0.2

+-----------------+-----+
|            items| freq|
+-----------------+-----+
|       [campaign]| 5153|
|          [white]| 5935|
|         [called]| 4748|
|          [think]| 5376|
|           [also]| 8993|
|      [also, one]| 5786|
|[also, president]| 4793|
|      [also, via]| 5068|
|    [also, trump]| 5251|
|   [also, people]| 5177|
|     [also, said]| 5929|
|        [america]| 6230|
|            [com]| 4947|
|          [house]| 5689|
|           [know]| 6552|
|            [one]|12319|
|     [one, trump]| 7117|
|      [one, said]| 7798|
|           [news]| 7181|
|           [much]| 5305|
+-----------------+-----+
only showing top 20 rows

+--------------------+-----------+------------------+------------------+-------------------+
|          antecedent| consequent|        confidence|              lift|            support|
+--------------------+-----------+------------------+------------------+-------------------+
|             [trump]|[president]|  0.61151481274455|1.3627861915996329| 0.3262749776319714|
|             [trump]|   [donald]|0.7249061726423381|1.8647821983875843|0.38677516935793105|
|[featured, via, t...|    [image]| 0.989423076923077| 2.582888337055004|0.21920668058455114|
|              [even]|      [one]|0.6543945769050958|1.2467972330984256|0.23854969962932981|
|              [even]|      [via]|0.6034361851332398| 1.259403405767586|0.21997358442333093|
|              [even]|    [trump]|0.6339410939691444|1.1881523130679381| 0.2310936900856376|
|              [even]|   [people]|0.6144226273959794|1.2613586536876613|0.22397852669251417|
|              [even]|     [said]|0.6386161757830762|1.1728450909080266|0.23279792083848153|
|  [featured, donald]|    [trump]|0.9973539973539973|1.8692721929166871|0.22483064206893613|
|  [featured, donald]|    [image]|0.9599319599319599|2.5059017941900823|0.21639469984235865|
|  [president, trump]|     [said]|0.6395925829198225| 1.174638303107289|0.20868305568573986|
|  [president, trump]|   [donald]|0.7397492817968138|1.9029652995454862| 0.2413616803715223|
|    [featured, said]|    [image]|0.9571180220204751|2.4985560109935014|0.21111158450854245|
|       [people, one]|     [said]|0.6607941834451901|1.2135759217247306|0.20135486344851092|
|  [featured, people]|    [image]|0.9568050642338485|2.4977390348829562|0.21895104597162457|
|               [new]|      [one]| 0.626998933901919| 1.194601183343773|0.20046014230326786|
|               [new]|     [said]|0.6332622601279317| 1.163012402774858|0.20246261343785948|
|              [like]|      [one]| 0.631547736706859| 1.203267873061668|0.27225086276681865|
|              [like]|    [trump]|0.6058509586874876|1.1355048990939889|0.26117336287333304|
|              [like]|   [people]|0.6127693220003954|1.2579645549436962| 0.2641557666908099|
+--------------------+-----------+------------------+------------------+-------------------+
only showing top 20 rows

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[star, rainyday, ...|         [via, said]|
|  1|[fool, us, commit...|[donald, one, peo...|
|  2|[fwfxszupxy, clar...|[president, via, ...|
|  3|[using, star, sin...|    [one, via, said]|
|  4|[internationally,...|    [president, via]|
|  5|[shared, posthast...|[one, trump, said...|
|  6|[since, twitter, ...|              [said]|
|  7|[looked, despite,...|         [said, one]|
|  8|[looks, intent, p...|              [said]|
|  9|[break, anymore, ...|         [one, said]|
| 10|[pay, opinions, k...|   [president, said]|
| 11|[wait, mark, fami...|[president, donal...|
| 12|[looked, looks, b...|            [people]|
| 13|[concerns, using,...|[donald, one, peo...|
| 14|[health, pay, bre...|[president, via, ...|
| 15|[invited, therapy...| [president, people]|
| 16|[accurate, mark, ...|    [said, featured]|
| 17|[brings, possibly...|  [one, via, people]|
| 18|[despite, homosex...|            [donald]|
| 19|[wait, texeye, wi...|[donald, one, via...|
+---+--------------------+--------------------+
only showing top 20 rows

Modello con minSup a 0.30000000000000004

+------------------+-----+
|             items| freq|
+------------------+-----+
|            [also]| 8993|
|             [one]|12319|
|      [one, trump]| 7117|
|       [one, said]| 7798|
|            [news]| 7181|
|             [via]|11246|
|       [via, said]| 7238|
|            [time]| 8585|
|        [featured]| 8486|
| [featured, image]| 8105|
|            [like]|10118|
|            [said]|12780|
|            [even]| 8556|
|       [president]|10532|
|[president, trump]| 7658|
|             [new]| 7504|
|          [donald]| 9124|
|   [donald, trump]| 9078|
|           [trump]|12523|
|     [trump, said]| 7428|
+------------------+-----+
only showing top 20 rows

+-----------+-----------+------------------+------------------+-------------------+
| antecedent| consequent|        confidence|              lift|            support|
+-----------+-----------+------------------+------------------+-------------------+
|    [trump]|[president]|  0.61151481274455|1.3627861915996329| 0.3262749776319714|
|    [trump]|   [donald]|0.7249061726423381|1.8647821983875843|0.38677516935793105|
|      [via]|     [said]|0.6436066156855771|1.1820102407477449|0.30838055472711007|
|      [via]|    [image]|0.6546327583140672|1.7089184151250663|0.31366367006092627|
|   [donald]|    [trump]|0.9949583516001753| 1.864782198387584|0.38677516935793105|
|      [one]|     [said]|0.6330059258056661| 1.162541634161564|0.33223978526692516|
|   [people]|      [one]|0.6255575964313829|1.1918550487735196| 0.3047164586084956|
|   [people]|     [said]|0.6326423510889531|1.1618739141164958|0.30816752588300456|
|    [image]| [featured]|0.9014570125681237|2.4932945489024783|0.34531975629500233|
|    [image]|      [via]|0.8188188188188188|1.7089184151250663|0.31366367006092627|
| [featured]|    [image]|0.9551025218006127|2.4932945489024783|0.34531975629500233|
|     [said]|      [one]|0.6101721439749609| 1.162541634161564|0.33223978526692516|
|[president]|    [trump]|0.7271173566274212|1.3627861915996329| 0.3262749776319714|
+-----------+-----------+------------------+------------------+-------------------+

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[star, rainyday, ...|              [said]|
|  1|[fool, us, commit...|       [donald, one]|
|  2|[fwfxszupxy, clar...|   [president, said]|
|  3|[using, star, sin...|         [one, said]|
|  4|[internationally,...|         [president]|
|  5|[shared, posthast...|         [said, one]|
|  6|[since, twitter, ...|                  []|
|  7|[looked, despite,...|         [said, one]|
|  8|[looks, intent, p...|              [said]|
|  9|[break, anymore, ...|         [said, one]|
| 10|[pay, opinions, k...|   [president, said]|
| 11|[wait, mark, fami...|[president, donal...|
| 12|[looked, looks, b...|                  []|
| 13|[concerns, using,...|       [donald, one]|
| 14|[health, pay, bre...|   [president, said]|
| 15|[invited, therapy...|         [president]|
| 16|[accurate, mark, ...|    [said, featured]|
| 17|[brings, possibly...|               [one]|
| 18|[despite, homosex...|            [donald]|
| 19|[wait, texeye, wi...| [donald, one, said]|
+---+--------------------+--------------------+
only showing top 20 rows

Modello con minSup a 0.4

+-----------+-----+
|      items| freq|
+-----------+-----+
|      [one]|12319|
|      [via]|11246|
|     [like]|10118|
|     [said]|12780|
|[president]|10532|
|    [trump]|12523|
|   [people]|11433|
+-----------+-----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[star, rainyday, ...|        []|
|  1|[fool, us, commit...|        []|
|  2|[fwfxszupxy, clar...|        []|
|  3|[using, star, sin...|        []|
|  4|[internationally,...|        []|
|  5|[shared, posthast...|        []|
|  6|[since, twitter, ...|        []|
|  7|[looked, despite,...|        []|
|  8|[looks, intent, p...|        []|
|  9|[break, anymore, ...|        []|
| 10|[pay, opinions, k...|        []|
| 11|[wait, mark, fami...|        []|
| 12|[looked, looks, b...|        []|
| 13|[concerns, using,...|        []|
| 14|[health, pay, bre...|        []|
| 15|[invited, therapy...|        []|
| 16|[accurate, mark, ...|        []|
| 17|[brings, possibly...|        []|
| 18|[despite, homosex...|        []|
| 19|[wait, texeye, wi...|        []|
+---+--------------------+----------+
only showing top 20 rows

Modello con minSup a 0.5

+-------+-----+
|  items| freq|
+-------+-----+
|  [one]|12319|
| [said]|12780|
|[trump]|12523|
+-------+-----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[star, rainyday, ...|        []|
|  1|[fool, us, commit...|        []|
|  2|[fwfxszupxy, clar...|        []|
|  3|[using, star, sin...|        []|
|  4|[internationally,...|        []|
|  5|[shared, posthast...|        []|
|  6|[since, twitter, ...|        []|
|  7|[looked, despite,...|        []|
|  8|[looks, intent, p...|        []|
|  9|[break, anymore, ...|        []|
| 10|[pay, opinions, k...|        []|
| 11|[wait, mark, fami...|        []|
| 12|[looked, looks, b...|        []|
| 13|[concerns, using,...|        []|
| 14|[health, pay, bre...|        []|
| 15|[invited, therapy...|        []|
| 16|[accurate, mark, ...|        []|
| 17|[brings, possibly...|        []|
| 18|[despite, homosex...|        []|
| 19|[wait, texeye, wi...|        []|
+---+--------------------+----------+
only showing top 20 rows

Modello con minSup a 0.6

+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[star, rainyday, ...|        []|
|  1|[fool, us, commit...|        []|
|  2|[fwfxszupxy, clar...|        []|
|  3|[using, star, sin...|        []|
|  4|[internationally,...|        []|
|  5|[shared, posthast...|        []|
|  6|[since, twitter, ...|        []|
|  7|[looked, despite,...|        []|
|  8|[looks, intent, p...|        []|
|  9|[break, anymore, ...|        []|
| 10|[pay, opinions, k...|        []|
| 11|[wait, mark, fami...|        []|
| 12|[looked, looks, b...|        []|
| 13|[concerns, using,...|        []|
| 14|[health, pay, bre...|        []|
| 15|[invited, therapy...|        []|
| 16|[accurate, mark, ...|        []|
| 17|[brings, possibly...|        []|
| 18|[despite, homosex...|        []|
| 19|[wait, texeye, wi...|        []|
+---+--------------------+----------+
only showing top 20 rows

Modello con minSup a 0.7

+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[star, rainyday, ...|        []|
|  1|[fool, us, commit...|        []|
|  2|[fwfxszupxy, clar...|        []|
|  3|[using, star, sin...|        []|
|  4|[internationally,...|        []|
|  5|[shared, posthast...|        []|
|  6|[since, twitter, ...|        []|
|  7|[looked, despite,...|        []|
|  8|[looks, intent, p...|        []|
|  9|[break, anymore, ...|        []|
| 10|[pay, opinions, k...|        []|
| 11|[wait, mark, fami...|        []|
| 12|[looked, looks, b...|        []|
| 13|[concerns, using,...|        []|
| 14|[health, pay, bre...|        []|
| 15|[invited, therapy...|        []|
| 16|[accurate, mark, ...|        []|
| 17|[brings, possibly...|        []|
| 18|[despite, homosex...|        []|
| 19|[wait, texeye, wi...|        []|
+---+--------------------+----------+
only showing top 20 rows

Modello con minSup a 0.7999999999999999

+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[star, rainyday, ...|        []|
|  1|[fool, us, commit...|        []|
|  2|[fwfxszupxy, clar...|        []|
|  3|[using, star, sin...|        []|
|  4|[internationally,...|        []|
|  5|[shared, posthast...|        []|
|  6|[since, twitter, ...|        []|
|  7|[looked, despite,...|        []|
|  8|[looks, intent, p...|        []|
|  9|[break, anymore, ...|        []|
| 10|[pay, opinions, k...|        []|
| 11|[wait, mark, fami...|        []|
| 12|[looked, looks, b...|        []|
| 13|[concerns, using,...|        []|
| 14|[health, pay, bre...|        []|
| 15|[invited, therapy...|        []|
| 16|[accurate, mark, ...|        []|
| 17|[brings, possibly...|        []|
| 18|[despite, homosex...|        []|
| 19|[wait, texeye, wi...|        []|
+---+--------------------+----------+
only showing top 20 rows

Modello con minSup a 0.8999999999999999

+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[star, rainyday, ...|        []|
|  1|[fool, us, commit...|        []|
|  2|[fwfxszupxy, clar...|        []|
|  3|[using, star, sin...|        []|
|  4|[internationally,...|        []|
|  5|[shared, posthast...|        []|
|  6|[since, twitter, ...|        []|
|  7|[looked, despite,...|        []|
|  8|[looks, intent, p...|        []|
|  9|[break, anymore, ...|        []|
| 10|[pay, opinions, k...|        []|
| 11|[wait, mark, fami...|        []|
| 12|[looked, looks, b...|        []|
| 13|[concerns, using,...|        []|
| 14|[health, pay, bre...|        []|
| 15|[invited, therapy...|        []|
| 16|[accurate, mark, ...|        []|
| 17|[brings, possibly...|        []|
| 18|[despite, homosex...|        []|
| 19|[wait, texeye, wi...|        []|
+---+--------------------+----------+
only showing top 20 rows